--- /dev/null
+From bb65f89b7d3d305c14951f49860711fbcae70692 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin <guro@fb.com>
+Date: Sat, 24 Aug 2019 17:54:50 -0700
+Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
+
+From: Roman Gushchin <guro@fb.com>
+
+commit bb65f89b7d3d305c14951f49860711fbcae70692 upstream.
+
+Similar to vmstats, percpu caching of local vmevents leads to an
+accumulation of errors on non-leaf levels. This happens because some
+leftovers may remain in percpu caches, so that they are never
+propagated up the cgroup tree and simply vanish when the memory cgroup
+is released.
+
+To fix this issue let's accumulate and propagate percpu vmevents values
+before releasing the memory cgroup similar to what we're doing with
+vmstats.
+
+Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
+only over online cpus.
+
+Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com
+Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c | 22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3185,6 +3185,25 @@ static void memcg_flush_percpu_vmstats(s
+ }
+ }
+
++static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
++{
++ unsigned long events[NR_VM_EVENT_ITEMS];
++ struct mem_cgroup *mi;
++ int cpu, i;
++
++ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++ events[i] = 0;
++
++ for_each_online_cpu(cpu)
++ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++ events[i] += raw_cpu_read(
++ memcg->vmstats_percpu->events[i]);
++
++ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
++ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++ atomic_long_add(events[i], &mi->vmevents[i]);
++}
++
+ #ifdef CONFIG_MEMCG_KMEM
+ static int memcg_online_kmem(struct mem_cgroup *memcg)
+ {
+@@ -4587,10 +4606,11 @@ static void __mem_cgroup_free(struct mem
+ int node;
+
+ /*
+- * Flush percpu vmstats to guarantee the value correctness
++ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
++ memcg_flush_percpu_vmevents(memcg);
+ for_each_node(node)
+ free_mem_cgroup_per_node_info(memcg, node);
+ free_percpu(memcg->vmstats_percpu);
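
The flush-before-free pattern in memcg_flush_percpu_vmevents() above is
easy to model outside the kernel. Below is a minimal userspace C sketch
of the same idea: before a node of the hierarchy goes away, the
leftovers sitting in its per-CPU event caches are summed and added to
the node and every ancestor, so nothing is silently dropped. The struct
and function names here are illustrative stand-ins, not the memcg API.

/*
 * Userspace model of the flush-before-free pattern: sum the per-CPU
 * leftovers of a node and fold them into the node and all ancestors
 * before the node is torn down.
 */
#include <stdio.h>

#define NR_CPUS   4
#define NR_EVENTS 2

struct node {
	struct node *parent;
	long percpu[NR_CPUS][NR_EVENTS];	/* models vmstats_percpu->events */
	long events[NR_EVENTS];			/* models the atomic vmevents[] */
};

static void flush_percpu_events(struct node *n)
{
	long sum[NR_EVENTS] = { 0 };
	struct node *mi;
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_EVENTS; i++)
			sum[i] += n->percpu[cpu][i];

	/* propagate to the node itself and every ancestor */
	for (mi = n; mi; mi = mi->parent)
		for (i = 0; i < NR_EVENTS; i++)
			mi->events[i] += sum[i];
}

int main(void)
{
	struct node a = { 0 }, b = { .parent = &a };

	/* leftovers that were never folded into the atomic counters */
	b.percpu[0][0] = 4;
	b.percpu[2][1] = 7;

	flush_percpu_events(&b);	/* call before freeing b */
	printf("A sees event0=%ld event1=%ld\n", a.events[0], a.events[1]);
	return 0;
}
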
--- /dev/null
+From c350a99ea2b1b666c28948d74ab46c16913c28a7 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin <guro@fb.com>
+Date: Sat, 24 Aug 2019 17:54:47 -0700
+Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
+
+From: Roman Gushchin <guro@fb.com>
+
+commit c350a99ea2b1b666c28948d74ab46c16913c28a7 upstream.
+
+Percpu caching of local vmstats with conditional propagation up the
+cgroup tree leads to an accumulation of errors on non-leaf levels.
+
+Let's imagine two nested memory cgroups A and A/B. Say, a process
+belonging to A/B allocates 100 pagecache pages on CPU 0. The percpu
+cache will spill 3 times, so that 32*3=96 pages will be accounted to
+the A/B and A atomic vmstat counters, while 4 pages will remain in the
+percpu cache.
+
+Now imagine A/B is close to its memory.max limit, so that every
+following allocation triggers a direct reclaim on the local CPU, and
+say each such attempt frees 16 pages on a new CPU. Then every percpu
+cache will hold -16 pages, except the first one, which will hold
+4 - 16 = -12. The A/B and A atomic counters will not be touched at all.
+
+Now a user removes A/B. All percpu caches are freed and corresponding
+vmstat numbers are forgotten. A has 96 pages more than expected.
+
+As memory cgroups are created and destroyed, errors do accumulate. Even
+1-2 page differences can add up to large numbers.
+
+To fix this issue let's accumulate and propagate percpu vmstat values
+before releasing the memory cgroup. At this point these numbers are
+stable and cannot be changed.
+
+Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
+only over online cpus.
+
+Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com
+Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 40 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3150,6 +3150,41 @@ static u64 mem_cgroup_read_u64(struct cg
+ }
+ }
+
++static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
++{
++ unsigned long stat[MEMCG_NR_STAT];
++ struct mem_cgroup *mi;
++ int node, cpu, i;
++
++ for (i = 0; i < MEMCG_NR_STAT; i++)
++ stat[i] = 0;
++
++ for_each_online_cpu(cpu)
++ for (i = 0; i < MEMCG_NR_STAT; i++)
++ stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
++
++ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
++ for (i = 0; i < MEMCG_NR_STAT; i++)
++ atomic_long_add(stat[i], &mi->vmstats[i]);
++
++ for_each_node(node) {
++ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
++ struct mem_cgroup_per_node *pi;
++
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++ stat[i] = 0;
++
++ for_each_online_cpu(cpu)
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++ stat[i] += raw_cpu_read(
++ pn->lruvec_stat_cpu->count[i]);
++
++ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
++ }
++}
++
+ #ifdef CONFIG_MEMCG_KMEM
+ static int memcg_online_kmem(struct mem_cgroup *memcg)
+ {
+@@ -4551,6 +4586,11 @@ static void __mem_cgroup_free(struct mem
+ {
+ int node;
+
++ /*
++ * Flush percpu vmstats to guarantee the value correctness
++ * on parent's and all ancestor levels.
++ */
++ memcg_flush_percpu_vmstats(memcg);
+ for_each_node(node)
+ free_mem_cgroup_per_node_info(memcg, node);
+ free_percpu(memcg->vmstats_percpu);
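
To make the changelog's arithmetic concrete, here is a small userspace
C model of the batching that causes the drift: a per-CPU delta is only
folded into the hierarchical counters once it reaches a batch of 32, so
100 single-page charges leave 96 pages in the atomic counters and 4
pages stranded in the cache, and a flush before teardown (what this
patch adds) recovers them. The 32-page batch and all of the names are
illustrative, standing in for MEMCG_CHARGE_BATCH-style batching rather
than the exact kernel code.

#include <stdio.h>
#include <stdlib.h>

#define BATCH 32	/* assumed spill threshold, one CPU shown */

struct cg {
	struct cg *parent;
	long vmstat;	/* hierarchical ("atomic") counter */
	long cached;	/* per-CPU cached delta */
};

static void mod_stat(struct cg *c, long val)
{
	struct cg *mi;

	c->cached += val;
	if (labs(c->cached) < BATCH)
		return;		/* stays in the per-CPU cache */

	for (mi = c; mi; mi = mi->parent)
		mi->vmstat += c->cached;
	c->cached = 0;
}

static void flush_before_free(struct cg *c)
{
	struct cg *mi;

	for (mi = c; mi; mi = mi->parent)
		mi->vmstat += c->cached;
	c->cached = 0;
}

int main(void)
{
	struct cg a = { 0 }, b = { .parent = &a };
	int i;

	for (i = 0; i < 100; i++)
		mod_stat(&b, 1);	/* 100 pagecache charges from A/B */

	printf("before flush: A=%ld (4 pages lost)\n", a.vmstat);
	flush_before_free(&b);		/* what this patch adds */
	printf("after flush:  A=%ld\n", a.vmstat);
	return 0;
}
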
--- /dev/null
+From cd961038381f392b364a7c4a040f4576ca415b1a Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Sat, 24 Aug 2019 17:54:40 -0700
+Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
+
+From: David Rientjes <rientjes@google.com>
+
+commit cd961038381f392b364a7c4a040f4576ca415b1a upstream.
+
+After commit 907ec5fca3dc ("mm: zero remaining unavailable struct
+pages"), struct page of reserved memory is zeroed. This causes
+page->flags to be 0 and fixes issues related to reading
+/proc/kpageflags, for example, of reserved memory.
+
+The VM_BUG_ON() in move_freepages_block(), however, assumes that
+page_zone() is meaningful even for reserved memory. That assumption is
+no longer true after the aforementioned commit.
+
+There's no reason why move_freepages_block() should be testing the
+legitimacy of page_zone() for reserved memory; its scope is limited only
+to pages on the zone's freelist.
+
+Note that pfn_valid() can be true for reserved memory: there is a
+backing struct page. The page_to_nid(page) check is equally unreliable
+for such zeroed pages, but since reserved memory normally appears only
+on node 0, the zeroed node id happens to match and the check does not
+trip.
+
+Move the debug checks to after verifying PageBuddy is true. This
+limits the scope of the checks to buddy pages on the zone's freelist
+that move_freepages_block() is operating on. In this case, an
+incorrect node or zone is a bug worth warning about (and examining the
+struct page is acceptable because this memory is not reserved).
+
+Why does move_freepages_block() get called on reserved memory? It's
+simply math after finding a valid free page from the per-zone free area
+to use as fallback. We find the beginning and end of the pageblock of
+the valid page and that can bring us into memory that was reserved per
+the e820. pfn_valid() is still true (it's backed by a struct page), but
+since it's zeroed we shouldn't draw any inferences here by comparing
+its node or zone. The current node check just happens to succeed most
+of the time by luck because reserved memory typically appears on node 0.
+
+The fix here is to validate that we actually have buddy pages before
+testing if there's any type of zone or node strangeness going on.
+
+We noticed it almost immediately after bringing 907ec5fca3dc in on
+CONFIG_DEBUG_VM builds. It depends on finding specific free pages in
+the per-zone free area where the math in move_freepages() will bring the
+start or end pfn into reserved memory and wanting to claim that entire
+pageblock as a new migratetype. So the path will be rare, require
+CONFIG_DEBUG_VM, and require fallback to a different migratetype.
+
+Some struct pages were already zeroed from reserve pages before
+907ec5fca3dc, so it theoretically could trigger before this commit. I
+think it's rare enough, and hidden behind a config option that most
+people don't run, that others may not have noticed. I wouldn't argue
+against a stable tag and the backport should be easy enough, but I
+probably wouldn't single out a commit that this is fixing.
+
+Mel said:
+
+: The overhead of the debugging check is higher with this patch although
+: it'll only affect debug builds and the path is not particularly hot.
+: If this was a concern, I think it would be reasonable to simply remove
+: the debugging check as the zone boundaries are checked in
+: move_freepages_block and we never expect a zone/node to be smaller than
+: a pageblock and stuck in the middle of another zone.
+
+Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp.google.com
+Signed-off-by: David Rientjes <rientjes@google.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 19 ++++---------------
+ 1 file changed, 4 insertions(+), 15 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2167,27 +2167,12 @@ static int move_freepages(struct zone *z
+ unsigned int order;
+ int pages_moved = 0;
+
+-#ifndef CONFIG_HOLES_IN_ZONE
+- /*
+- * page_zone is not safe to call in this context when
+- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
+- * anyway as we check zone boundaries in move_freepages_block().
+- * Remove at a later date when no bug reports exist related to
+- * grouping pages by mobility
+- */
+- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
+- pfn_valid(page_to_pfn(end_page)) &&
+- page_zone(start_page) != page_zone(end_page));
+-#endif
+ for (page = start_page; page <= end_page;) {
+ if (!pfn_valid_within(page_to_pfn(page))) {
+ page++;
+ continue;
+ }
+
+- /* Make sure we are not inadvertently changing nodes */
+- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+-
+ if (!PageBuddy(page)) {
+ /*
+ * We assume that pages that could be isolated for
+@@ -2202,6 +2187,10 @@ static int move_freepages(struct zone *z
+ continue;
+ }
+
++ /* Make sure we are not inadvertently changing nodes */
++ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
++ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
++
+ order = page_order(page);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
+ page += 1 << order;
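
The pageblock rounding that the changelog describes is plain pfn
arithmetic. The sketch below assumes the common x86-64 layout of
pageblock_order == 9 (512 pages of 4 KiB per pageblock); rounding a
perfectly valid free pfn down to its pageblock boundary can land below
the first usable pfn of the zone, i.e. in e820-reserved memory whose
struct pages are zeroed, which is why only PageBuddy pages may be
examined.

#include <stdio.h>

/* assumption: pageblock_order == 9, as on x86-64 with 4K pages */
#define PAGEBLOCK_NR_PAGES (1UL << 9)

int main(void)
{
	unsigned long zone_start_pfn = 0x100;	/* first usable pfn (example) */
	unsigned long free_pfn = 0x180;		/* valid free page chosen as fallback */

	/* what move_freepages_block()-style rounding effectively does */
	unsigned long start = free_pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long end = start + PAGEBLOCK_NR_PAGES - 1;

	printf("pageblock spans pfn %#lx-%#lx\n", start, end);
	if (start < zone_start_pfn)
		printf("pfns %#lx-%#lx are reserved: zeroed struct pages\n",
		       start, zone_start_pfn - 1);
	return 0;
}
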
--- /dev/null
+From f7da677bc6e72033f0981b9d58b5c5d409fa641e Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Sat, 24 Aug 2019 17:54:59 -0700
+Subject: mm, page_owner: handle THP splits correctly
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit f7da677bc6e72033f0981b9d58b5c5d409fa641e upstream.
+
+The THP splitting path is missing the split_page_owner() call that
+split_page() has.
+
+As a result, split THP pages are wrongly reported in the page_owner file
+as order-9 pages. Furthermore when the former head page is freed, the
+remaining former tail pages are not listed in the page_owner file at
+all. This patch fixes that by adding the split_page_owner() call into
+__split_huge_page().
+
+Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz
+Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling")
+Reported-by: Kirill A. Shutemov <kirill@shutemov.name>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -32,6 +32,7 @@
+ #include <linux/shmem_fs.h>
+ #include <linux/oom.h>
+ #include <linux/numa.h>
++#include <linux/page_owner.h>
+
+ #include <asm/tlb.h>
+ #include <asm/pgalloc.h>
+@@ -2500,6 +2501,9 @@ static void __split_huge_page(struct pag
+ }
+
+ ClearPageCompound(head);
++
++ split_page_owner(head, HPAGE_PMD_ORDER);
++
+ /* See comment in __split_huge_page_tail() */
+ if (PageAnon(head)) {
+ /* Additional pin to swap cache */
dm-zoned-properly-handle-backing-device-failure.patch
genirq-properly-pair-kobject_del-with-kobject_add.patch
mm-z3fold.c-fix-race-between-migration-and-destruction.patch
+mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
+mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
+mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
+mm-page_owner-handle-thp-splits-correctly.patch