From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Mon, 15 Apr 2019 14:40:27 +0000 (+0200)
Subject: drop broken 4.14 patch
X-Git-Tag: v4.9.169~15
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f3f179ab36d9901c2b206086fbe5b129bd308237;p=thirdparty%2Fkernel%2Fstable-queue.git

drop broken 4.14 patch
---

diff --git a/queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch b/queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch
deleted file mode 100644
index c647b461f32..00000000000
--- a/queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch
+++ /dev/null
@@ -1,223 +0,0 @@
-From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
-From: Greg Thelen <gthelen@google.com>
-Date: Fri, 5 Apr 2019 18:39:18 -0700
-Subject: mm: writeback: use exact memcg dirty counts
-
-From: Greg Thelen <gthelen@google.com>
-
-commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
-
-Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
-memory.stat reporting") memcg dirty and writeback counters are managed
-as:
-
- 1) per-memcg per-cpu values in range of [-32..32]
-
- 2) per-memcg atomic counter
-
-When a per-cpu counter cannot fit in [-32..32] it's flushed to the
-atomic. Stat readers only check the atomic. Thus readers such as
-balance_dirty_pages() may see a nontrivial error margin: 32 pages per
-cpu.
-
-Assuming 100 cpus:
-   4k x86 page_size:  13 MiB error per memcg
-  64k ppc page_size: 200 MiB error per memcg
-
-Considering that dirty+writeback are used together for some decisions the
-errors double.
-
-This inaccuracy can lead to undeserved oom kills. One nasty case is
-when all per-cpu counters hold positive values offsetting an atomic
-negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
-balance_dirty_pages() only consults the atomic and does not consider
-throttling the next n_cpu*32 dirty pages. If the file_lru is in the
-13..200 MiB range then there's absolutely no dirty throttling, which
-burdens vmscan with only dirty+writeback pages thus resorting to oom
-kill.
-
-It could be argued that tiny containers are not supported, but it's more
-subtle. It's the amount of space available for the file lru that matters.
-If a container has memory.max-200MiB of non reclaimable memory, then it
-will also suffer such oom kills on a 100 cpu machine.
-
-The following test reliably ooms without this patch. This patch avoids
-oom kills.
-
-  $ cat test
-  mount -t cgroup2 none /dev/cgroup
-  cd /dev/cgroup
-  echo +io +memory > cgroup.subtree_control
-  mkdir test
-  cd test
-  echo 10M > memory.max
-  (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
-  (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
-
-  $ cat memcg-writeback-stress.c
-  /*
-   * Dirty pages from all but one cpu.
-   * Clean pages from the non dirtying cpu.
-   * This is to stress per cpu counter imbalance.
-   * On a 100 cpu machine:
-   * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
-   * - per memcg atomic is -99*32 pages
-   * - thus the complete dirty limit: sum of all counters 0
-   * - balance_dirty_pages() only sees atomic count -99*32 pages, which
-   *   it max()s to 0.
-   * - So a workload can dirty -99*32 pages before balance_dirty_pages()
-   *   cares.
-   */
-  #define _GNU_SOURCE
-  #include <err.h>
-  #include <fcntl.h>
-  #include <sched.h>
-  #include <stdlib.h>
-  #include <stdio.h>
-  #include <sys/stat.h>
-  #include <sys/sysinfo.h>
-  #include <sys/types.h>
-  #include <unistd.h>
-
-  static char *buf;
-  static int bufSize;
-
-  static void set_affinity(int cpu)
-  {
-          cpu_set_t affinity;
-
-          CPU_ZERO(&affinity);
-          CPU_SET(cpu, &affinity);
-          if (sched_setaffinity(0, sizeof(affinity), &affinity))
-                  err(1, "sched_setaffinity");
-  }
-
-  static void dirty_on(int output_fd, int cpu)
-  {
-          int i, wrote;
-
-          set_affinity(cpu);
-          for (i = 0; i < 32; i++) {
-                  for (wrote = 0; wrote < bufSize; ) {
-                          int ret = write(output_fd, buf+wrote, bufSize-wrote);
-                          if (ret == -1)
-                                  err(1, "write");
-                          wrote += ret;
-                  }
-          }
-  }
-
-  int main(int argc, char **argv)
-  {
-          int cpu, flush_cpu = 1, output_fd;
-          const char *output;
-
-          if (argc != 2)
-                  errx(1, "usage: output_file");
-
-          output = argv[1];
-          bufSize = getpagesize();
-          buf = malloc(getpagesize());
-          if (buf == NULL)
-                  errx(1, "malloc failed");
-
-          output_fd = open(output, O_CREAT|O_RDWR);
-          if (output_fd == -1)
-                  err(1, "open(%s)", output);
-
-          for (cpu = 0; cpu < get_nprocs(); cpu++) {
-                  if (cpu != flush_cpu)
-                          dirty_on(output_fd, cpu);
-          }
-
-          set_affinity(flush_cpu);
-          if (fsync(output_fd))
-                  err(1, "fsync(%s)", output);
-          if (close(output_fd))
-                  err(1, "close(%s)", output);
-          free(buf);
-  }
-
-Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
-collect exact per memcg counters. This avoids the aforementioned oom
-kills.
-
-This does not affect the overhead of memory.stat, which still reads the
-single atomic counter.
-
-Why not use percpu_counter? memcg already handles cpus going offline, so
-no need for that overhead from percpu_counter. And the percpu_counter
-spinlocks are more heavyweight than is required.
-
-It probably also makes sense to use exact dirty and writeback counters
-in memcg oom reports. But that is saved for later.
-
-Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
-Signed-off-by: Greg Thelen <gthelen@google.com>
-Reviewed-by: Roman Gushchin <guro@fb.com>
-Acked-by: Johannes Weiner <hannes@cmpxchg.org>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
-Cc: Tejun Heo <tj@kernel.org>
-Cc: <stable@vger.kernel.org> [4.16+]
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/linux/memcontrol.h |    5 ++++-
- mm/memcontrol.c            |   20 ++++++++++++++++++--
- 2 files changed, 22 insertions(+), 3 deletions(-)
-
---- a/include/linux/memcontrol.h
-+++ b/include/linux/memcontrol.h
-@@ -488,7 +488,10 @@ struct mem_cgroup *lock_page_memcg(struc
- void __unlock_page_memcg(struct mem_cgroup *memcg);
- void unlock_page_memcg(struct page *page);
- 
--/* idx can be of type enum memcg_stat_item or node_stat_item */
-+/*
-+ * idx can be of type enum memcg_stat_item or node_stat_item.
-+ * Keep in sync with memcg_exact_page_state().
-+ */
- static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
-                                              int idx)
- {
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -3671,6 +3671,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
-         return &memcg->cgwb_domain;
- }
- 
-+/*
-+ * idx can be of type enum memcg_stat_item or node_stat_item.
-+ * Keep in sync with memcg_exact_page().
-+ */
-+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-+{
-+        long x = atomic_long_read(&memcg->stat[idx]);
-+        int cpu;
-+
-+        for_each_online_cpu(cpu)
-+                x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
-+        if (x < 0)
-+                x = 0;
-+        return x;
-+}
-+
- /**
-  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
-  * @wb: bdi_writeback in question
-@@ -3696,10 +3712,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
-         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
-         struct mem_cgroup *parent;
- 
--        *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
-+        *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- 
-         /* this should eventually include NR_UNSTABLE_NFS */
--        *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
-+        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
-         *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-                                               (1 << LRU_ACTIVE_FILE));
-         *pheadroom = PAGE_COUNTER_MAX;
diff --git a/queue-4.14/series b/queue-4.14/series
index 1dd3f280751..1bd0ab3e87c 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -42,7 +42,6 @@ parisc-regs_return_value-should-return-gpr28.patch
 alarmtimer-return-correct-remaining-time.patch
 drm-udl-add-a-release-method-and-delay-modeset-teardown.patch
 include-linux-bitrev.h-fix-constant-bitrev.patch
-mm-writeback-use-exact-memcg-dirty-counts.patch
 asoc-fsl_esai-fix-channel-swap-issue-when-stream-starts.patch
 btrfs-do-not-allow-trimming-when-a-fs-is-mounted-with-the-nologreplay-option.patch
 btrfs-prop-fix-zstd-compression-parameter-validation.patch
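
As an aside, the failure mode described in the dropped patch's changelog is
easy to model outside the kernel. Below is a minimal, single-threaded sketch
of the split-counter scheme, assuming only what the changelog states (per-cpu
batches in [-32..32] flushed to a shared atomic, and a clamp-at-zero read);
every identifier in it is an illustrative stand-in, not kernel API, and the
real counters are of course updated concurrently.

  $ cat split-counter-model.c
  /*
   * Model of the memcg split counter.  Each "cpu" accumulates a small
   * signed delta and flushes it to the shared total only when the delta
   * leaves [-BATCH..BATCH].  read_approx() mimics what memcg_page_state()
   * returned (the atomic alone, clamped at zero); read_exact() mimics
   * memcg_exact_page_state() (atomic plus every per-cpu delta).
   */
  #include <stdio.h>

  #define NR_CPUS 100
  #define BATCH   32              /* the [-32..32] batch from the changelog */

  static long total;              /* stands in for the per-memcg atomic */
  static long delta[NR_CPUS];     /* stands in for the per-cpu counts */

  static void mod_counter(int cpu, long pages)
  {
          long x = delta[cpu] + pages;

          if (x > BATCH || x < -BATCH) {  /* flush on batch overflow */
                  total += x;
                  x = 0;
          }
          delta[cpu] = x;
  }

  static long read_approx(void)   /* atomic only, max()ed to 0 */
  {
          return total < 0 ? 0 : total;
  }

  static long read_exact(void)    /* atomic plus all per-cpu deltas */
  {
          long x = total;
          int cpu;

          for (cpu = 0; cpu < NR_CPUS; cpu++)
                  x += delta[cpu];
          return x < 0 ? 0 : x;
  }

  int main(void)
  {
          int cpu, i;

          /* the nasty case above: per_cpu[*]=32, atomic=-99*32 */
          total = -(NR_CPUS - 1) * (long)BATCH;
          for (cpu = 0; cpu < NR_CPUS - 1; cpu++)
                  delta[cpu] = BATCH;

          /* dirty 1000 more pages, one at a time, from the last cpu */
          for (i = 0; i < 1000; i++)
                  mod_counter(NR_CPUS - 1, 1);

          printf("approx: %ld pages, exact: %ld pages\n",
                 read_approx(), read_exact());
          return 0;
  }

Built with "cc -o split-counter-model split-counter-model.c", this should
print "approx: 0 pages, exact: 1000 pages": the workload has dirtied 1000
pages, yet a reader that consults only the shared atomic still sees zero.
That is the window in which balance_dirty_pages() applies no throttling, and
it is exactly the gap memcg_exact_page_state() closes by also summing the
per-cpu deltas.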