From: Greg Kroah-Hartman
Date: Mon, 26 Aug 2019 16:30:55 +0000 (+0200)
Subject: 5.2-stable patches
X-Git-Tag: v4.14.141~26
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ab8cf3ce2f8511590f26916ddad9657712053b60;p=thirdparty%2Fkernel%2Fstable-queue.git

5.2-stable patches

added patches:
	mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
	mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
	mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
	mm-page_owner-handle-thp-splits-correctly.patch
---

diff --git a/queue-5.2/mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch b/queue-5.2/mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
new file mode 100644
index 00000000000..9ba2b52c35b
--- /dev/null
+++ b/queue-5.2/mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
@@ -0,0 +1,78 @@
+From bb65f89b7d3d305c14951f49860711fbcae70692 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin
+Date: Sat, 24 Aug 2019 17:54:50 -0700
+Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
+
+From: Roman Gushchin
+
+commit bb65f89b7d3d305c14951f49860711fbcae70692 upstream.
+
+Similar to vmstats, percpu caching of local vmevents leads to an
+accumulation of errors on non-leaf levels. This happens because some
+leftovers may remain in percpu caches, so that they are never propagated
+up by the cgroup tree and just disappear into nonexistence on
+releasing of the memory cgroup.
+
+To fix this issue let's accumulate and propagate percpu vmevents values
+before releasing the memory cgroup similar to what we're doing with
+vmstats.
+
+Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
+only over online cpus.
+
+Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com
+Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
+Signed-off-by: Roman Gushchin
+Acked-by: Michal Hocko
+Cc: Johannes Weiner
+Cc: Vladimir Davydov
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/memcontrol.c | 22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3185,6 +3185,25 @@ static void memcg_flush_percpu_vmstats(s
+ 	}
+ }
+
++static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
++{
++	unsigned long events[NR_VM_EVENT_ITEMS];
++	struct mem_cgroup *mi;
++	int cpu, i;
++
++	for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++		events[i] = 0;
++
++	for_each_online_cpu(cpu)
++		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++			events[i] += raw_cpu_read(
++				memcg->vmstats_percpu->events[i]);
++
++	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
++		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
++			atomic_long_add(events[i], &mi->vmevents[i]);
++}
++
+ #ifdef CONFIG_MEMCG_KMEM
+ static int memcg_online_kmem(struct mem_cgroup *memcg)
+ {
+@@ -4587,10 +4606,11 @@ static void __mem_cgroup_free(struct mem
+ 	int node;
+
+ 	/*
+-	 * Flush percpu vmstats to guarantee the value correctness
++	 * Flush percpu vmstats and vmevents to guarantee the value correctness
+ 	 * on parent's and all ancestor levels.
+ 	 */
+ 	memcg_flush_percpu_vmstats(memcg);
++	memcg_flush_percpu_vmevents(memcg);
+ 	for_each_node(node)
+ 		free_mem_cgroup_per_node_info(memcg, node);
+ 	free_percpu(memcg->vmstats_percpu);
diff --git a/queue-5.2/mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch b/queue-5.2/mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
new file mode 100644
index 00000000000..6d5cd7f2cc2
--- /dev/null
+++ b/queue-5.2/mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
@@ -0,0 +1,107 @@
+From c350a99ea2b1b666c28948d74ab46c16913c28a7 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin
+Date: Sat, 24 Aug 2019 17:54:47 -0700
+Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
+
+From: Roman Gushchin
+
+commit c350a99ea2b1b666c28948d74ab46c16913c28a7 upstream.
+
+Percpu caching of local vmstats with the conditional propagation by the
+cgroup tree leads to an accumulation of errors on non-leaf levels.
+
+Let's imagine two nested memory cgroups A and A/B. Say, a process
+belonging to A/B allocates 100 pagecache pages on the CPU 0. The percpu
+cache will spill 3 times, so that 32*3=96 pages will be accounted to A/B
+and A atomic vmstat counters, 4 pages will remain in the percpu cache.
+
+Imagine A/B is nearby memory.max, so that every following allocation
+triggers a direct reclaim on the local CPU. Say, each such attempt will
+free 16 pages on a new cpu. That means every percpu cache will have -16
+pages, except the first one, which will have 4 - 16 = -12. A/B and A
+atomic counters will not be touched at all.
+
+Now a user removes A/B. All percpu caches are freed and corresponding
+vmstat numbers are forgotten. A has 96 pages more than expected.
+
+As memory cgroups are created and destroyed, errors do accumulate. Even
+1-2 pages differences can accumulate into large numbers.
+
+To fix this issue let's accumulate and propagate percpu vmstat values
+before releasing the memory cgroup. At this point these numbers are
+stable and cannot be changed.
+
+Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
+only over online cpus.
+
+Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com
+Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
+Signed-off-by: Roman Gushchin
+Acked-by: Michal Hocko
+Cc: Johannes Weiner
+Cc: Vladimir Davydov
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 40 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3150,6 +3150,41 @@ static u64 mem_cgroup_read_u64(struct cg
+ 	}
+ }
+
++static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
++{
++	unsigned long stat[MEMCG_NR_STAT];
++	struct mem_cgroup *mi;
++	int node, cpu, i;
++
++	for (i = 0; i < MEMCG_NR_STAT; i++)
++		stat[i] = 0;
++
++	for_each_online_cpu(cpu)
++		for (i = 0; i < MEMCG_NR_STAT; i++)
++			stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
++
++	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
++		for (i = 0; i < MEMCG_NR_STAT; i++)
++			atomic_long_add(stat[i], &mi->vmstats[i]);
++
++	for_each_node(node) {
++		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
++		struct mem_cgroup_per_node *pi;
++
++		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++			stat[i] = 0;
++
++		for_each_online_cpu(cpu)
++			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++				stat[i] += raw_cpu_read(
++					pn->lruvec_stat_cpu->count[i]);
++
++		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
++			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
++				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
++	}
++}
++
+ #ifdef CONFIG_MEMCG_KMEM
+ static int memcg_online_kmem(struct mem_cgroup *memcg)
+ {
+@@ -4551,6 +4586,11 @@ static void __mem_cgroup_free(struct mem
+ {
+ 	int node;
+
++	/*
++	 * Flush percpu vmstats to guarantee the value correctness
++	 * on parent's and all ancestor levels.
++	 */
++	memcg_flush_percpu_vmstats(memcg);
+ 	for_each_node(node)
+ 		free_mem_cgroup_per_node_info(memcg, node);
+ 	free_percpu(memcg->vmstats_percpu);
diff --git a/queue-5.2/mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch b/queue-5.2/mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
new file mode 100644
index 00000000000..41dc0deb7fe
--- /dev/null
+++ b/queue-5.2/mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
@@ -0,0 +1,127 @@
+From cd961038381f392b364a7c4a040f4576ca415b1a Mon Sep 17 00:00:00 2001
+From: David Rientjes
+Date: Sat, 24 Aug 2019 17:54:40 -0700
+Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
+
+From: David Rientjes
+
+commit cd961038381f392b364a7c4a040f4576ca415b1a upstream.
+
+After commit 907ec5fca3dc ("mm: zero remaining unavailable struct
+pages"), struct page of reserved memory is zeroed. This causes
+page->flags to be 0 and fixes issues related to reading
+/proc/kpageflags, for example, of reserved memory.
+
+The VM_BUG_ON() in move_freepages_block(), however, assumes that
+page_zone() is meaningful even for reserved memory. That assumption is
+no longer true after the aforementioned commit.
+
+There's no reason why move_freepages_block() should be testing the
+legitimacy of page_zone() for reserved memory; its scope is limited only
+to pages on the zone's freelist.
+
+Note that pfn_valid() can be true for reserved memory: there is a
+backing struct page. The check for page_to_nid(page) is also buggy but
+reserved memory normally only appears on node 0 so the zeroing doesn't
+affect this.
+
+Move the debug checks to after verifying PageBuddy is true. This
+isolates the scope of the checks to only be for buddy pages which are on
+the zone's freelist which move_freepages_block() is operating on. In
+this case, an incorrect node or zone is a bug worthy of being warned
+about (and the examination of struct page is acceptable because this
+memory is not reserved).
+
+Why does move_freepages_block() get called on reserved memory? It's
+simply math after finding a valid free page from the per-zone free area
+to use as fallback. We find the beginning and end of the pageblock of
+the valid page and that can bring us into memory that was reserved per
+the e820. pfn_valid() is still true (it's backed by a struct page), but
+since it's zero'd we shouldn't make any inferences here about comparing
+its node or zone. The current node check just happens to succeed most
+of the time by luck because reserved memory typically appears on node 0.
+
+The fix here is to validate that we actually have buddy pages before
+testing if there's any type of zone or node strangeness going on.
+
+We noticed it almost immediately after bringing 907ec5fca3dc in on
+CONFIG_DEBUG_VM builds. It depends on finding specific free pages in
+the per-zone free area where the math in move_freepages() will bring the
+start or end pfn into reserved memory and wanting to claim that entire
+pageblock as a new migratetype. So the path will be rare, require
+CONFIG_DEBUG_VM, and require fallback to a different migratetype.
+
+Some struct pages were already zeroed from reserve pages before
+907ec5fca3dc so it theoretically could trigger before this commit. I
+think it's rare enough under a config option that most people don't run
+that others may not have noticed. I wouldn't argue against a stable tag
+and the backport should be easy enough, but probably wouldn't single out
+a commit that this is fixing.
+
+Mel said:
+
+: The overhead of the debugging check is higher with this patch although
+: it'll only affect debug builds and the path is not particularly hot.
+: If this was a concern, I think it would be reasonable to simply remove
+: the debugging check as the zone boundaries are checked in
+: move_freepages_block and we never expect a zone/node to be smaller than
+: a pageblock and stuck in the middle of another zone.
+
+Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp.google.com
+Signed-off-by: David Rientjes
+Acked-by: Mel Gorman
+Cc: Naoya Horiguchi
+Cc: Masayoshi Mizuma
+Cc: Oscar Salvador
+Cc: Pavel Tatashin
+Cc: Vlastimil Babka
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/page_alloc.c | 19 ++++---------------
+ 1 file changed, 4 insertions(+), 15 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2167,27 +2167,12 @@ static int move_freepages(struct zone *z
+ 	unsigned int order;
+ 	int pages_moved = 0;
+
+-#ifndef CONFIG_HOLES_IN_ZONE
+-	/*
+-	 * page_zone is not safe to call in this context when
+-	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
+-	 * anyway as we check zone boundaries in move_freepages_block().
+-	 * Remove at a later date when no bug reports exist related to
+-	 * grouping pages by mobility
+-	 */
+-	VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
+-		  pfn_valid(page_to_pfn(end_page)) &&
+-		  page_zone(start_page) != page_zone(end_page));
+-#endif
+ 	for (page = start_page; page <= end_page;) {
+ 		if (!pfn_valid_within(page_to_pfn(page))) {
+ 			page++;
+ 			continue;
+ 		}
+
+-		/* Make sure we are not inadvertently changing nodes */
+-		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+-
+ 		if (!PageBuddy(page)) {
+ 			/*
+ 			 * We assume that pages that could be isolated for
+@@ -2202,6 +2187,10 @@ static int move_freepages(struct zone *z
+ 			continue;
+ 		}
+
++		/* Make sure we are not inadvertently changing nodes */
++		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
++		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
++
+ 		order = page_order(page);
+ 		move_to_free_area(page, &zone->free_area[order], migratetype);
+ 		page += 1 << order;
diff --git a/queue-5.2/mm-page_owner-handle-thp-splits-correctly.patch b/queue-5.2/mm-page_owner-handle-thp-splits-correctly.patch
new file mode 100644
index 00000000000..789c1008067
--- /dev/null
+++ b/queue-5.2/mm-page_owner-handle-thp-splits-correctly.patch
@@ -0,0 +1,54 @@
+From f7da677bc6e72033f0981b9d58b5c5d409fa641e Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka
+Date: Sat, 24 Aug 2019 17:54:59 -0700
+Subject: mm, page_owner: handle THP splits correctly
+
+From: Vlastimil Babka
+
+commit f7da677bc6e72033f0981b9d58b5c5d409fa641e upstream.
+
+THP splitting path is missing the split_page_owner() call that
+split_page() has.
+
+As a result, split THP pages are wrongly reported in the page_owner file
+as order-9 pages. Furthermore when the former head page is freed, the
+remaining former tail pages are not listed in the page_owner file at
+all. This patch fixes that by adding the split_page_owner() call into
+__split_huge_page().
+
+Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz
+Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling")
+Reported-by: Kirill A. Shutemov
+Signed-off-by: Vlastimil Babka
+Cc: Michal Hocko
+Cc: Mel Gorman
+Cc: Matthew Wilcox
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/huge_memory.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -32,6 +32,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include
+ #include
+@@ -2500,6 +2501,9 @@ static void __split_huge_page(struct pag
+ 	}
+
+ 	ClearPageCompound(head);
++
++	split_page_owner(head, HPAGE_PMD_ORDER);
++
+ 	/* See comment in __split_huge_page_tail() */
+ 	if (PageAnon(head)) {
+ 		/* Additional pin to swap cache */
diff --git a/queue-5.2/series b/queue-5.2/series
index d7f5edbc535..7410bf86b55 100644
--- a/queue-5.2/series
+++ b/queue-5.2/series
@@ -144,3 +144,7 @@ dm-zoned-improve-error-handling-in-i-o-map-code.patch
 dm-zoned-properly-handle-backing-device-failure.patch
 genirq-properly-pair-kobject_del-with-kobject_add.patch
 mm-z3fold.c-fix-race-between-migration-and-destruction.patch
+mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
+mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
+mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
+mm-page_owner-handle-thp-splits-correctly.patch
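Both memcontrol patches above follow one pattern: before the counter owner is torn down, each online CPU's cached percpu deltas are folded into the atomic totals of the owner and of every ancestor, so nothing is lost once the percpu storage is freed. The standalone C program below is a simplified sketch of that pattern outside the kernel; the names and types here (struct group, flush_percpu_before_release, the fixed NR_CPUS) are hypothetical illustrations, not the kernel's API.

/*
 * Illustrative userspace sketch of the "flush percpu counters before
 * release" pattern used by the memcontrol patches above.
 * Hypothetical names and types; this is NOT kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS  4
#define NR_ITEMS 3

struct group {
	struct group *parent;            /* NULL for the root group */
	atomic_long totals[NR_ITEMS];    /* hierarchical totals (like vmstats/vmevents) */
	long percpu[NR_CPUS][NR_ITEMS];  /* per-CPU cached deltas not yet propagated */
};

/*
 * Fold every online CPU's cached deltas into this group and all of its
 * ancestors, so the values survive after the percpu storage is freed.
 */
static void flush_percpu_before_release(struct group *g)
{
	long sum[NR_ITEMS] = { 0 };
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_ITEMS; i++)
			sum[i] += g->percpu[cpu][i];

	for (struct group *a = g; a; a = a->parent)
		for (i = 0; i < NR_ITEMS; i++)
			atomic_fetch_add(&a->totals[i], sum[i]);
}

int main(void)
{
	struct group root = { .parent = NULL };
	struct group child = { .parent = &root };

	/* Pretend CPU 2 still caches an unflushed delta of 4 pages for item 0. */
	child.percpu[2][0] = 4;

	/* Without this call those 4 pages would silently vanish when child is freed. */
	flush_percpu_before_release(&child);

	printf("root total for item 0: %ld\n", atomic_load(&root.totals[0]));
	return 0;
}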