+++ /dev/null
-From foo@baz Tue Oct 16 11:10:21 CEST 2018
-From: Jay Kamat <jgkamat@fb.com>
-Date: Fri, 7 Sep 2018 14:34:05 -0700
-Subject: Add tests for memory.oom.group
-
-From: Jay Kamat <jgkamat@fb.com>
-
-[ Upstream commit a987785dcd6c8ae2915460582aebd6481c81eb67 ]
-
-Add tests for memory.oom.group for the following cases:
-- Killing all processes in a leaf cgroup, but leaving the
- parent untouched
-- Killing all processes in a parent and leaf cgroup
-- Keeping processes marked by OOM_SCORE_ADJ_MIN alive when considered
- for being killed by the group oom killer.
-
-Signed-off-by: Jay Kamat <jgkamat@fb.com>
-Acked-by: Roman Gushchin <guro@fb.com>
-Signed-off-by: Shuah Khan (Samsung OSG) <shuah@kernel.org>
-Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- tools/testing/selftests/cgroup/cgroup_util.c | 21 ++
- tools/testing/selftests/cgroup/cgroup_util.h | 1
- tools/testing/selftests/cgroup/test_memcontrol.c | 205 +++++++++++++++++++++++
- 3 files changed, 227 insertions(+)
-
---- a/tools/testing/selftests/cgroup/cgroup_util.c
-+++ b/tools/testing/selftests/cgroup/cgroup_util.c
-@@ -340,3 +340,24 @@ int is_swap_enabled(void)
-
- return cnt > 1;
- }
-+
-+int set_oom_adj_score(int pid, int score)
-+{
-+ char path[PATH_MAX];
-+ int fd, len;
-+
-+ sprintf(path, "/proc/%d/oom_score_adj", pid);
-+
-+ fd = open(path, O_WRONLY | O_APPEND);
-+ if (fd < 0)
-+ return fd;
-+
-+ len = dprintf(fd, "%d", score);
-+ if (len < 0) {
-+ close(fd);
-+ return len;
-+ }
-+
-+ close(fd);
-+ return 0;
-+}
---- a/tools/testing/selftests/cgroup/cgroup_util.h
-+++ b/tools/testing/selftests/cgroup/cgroup_util.h
-@@ -39,3 +39,4 @@ extern int get_temp_fd(void);
- extern int alloc_pagecache(int fd, size_t size);
- extern int alloc_anon(const char *cgroup, void *arg);
- extern int is_swap_enabled(void);
-+extern int set_oom_adj_score(int pid, int score);
---- a/tools/testing/selftests/cgroup/test_memcontrol.c
-+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
-@@ -2,6 +2,7 @@
- #define _GNU_SOURCE
-
- #include <linux/limits.h>
-+#include <linux/oom.h>
- #include <fcntl.h>
- #include <stdio.h>
- #include <stdlib.h>
-@@ -202,6 +203,36 @@ static int alloc_pagecache_50M_noexit(co
- return 0;
- }
-
-+static int alloc_anon_noexit(const char *cgroup, void *arg)
-+{
-+ int ppid = getppid();
-+
-+ if (alloc_anon(cgroup, arg))
-+ return -1;
-+
-+ while (getppid() == ppid)
-+ sleep(1);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Wait until processes are killed asynchronously by the OOM killer
-+ * If we exceed a timeout, fail.
-+ */
-+static int cg_test_proc_killed(const char *cgroup)
-+{
-+ int limit;
-+
-+ for (limit = 10; limit > 0; limit--) {
-+ if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
-+ return 0;
-+
-+ usleep(100000);
-+ }
-+ return -1;
-+}
-+
- /*
- * First, this test creates the following hierarchy:
- * A memory.min = 50M, memory.max = 200M
-@@ -964,6 +995,177 @@ cleanup:
- return ret;
- }
-
-+/*
-+ * This test disables swapping and tries to allocate anonymous memory
-+ * up to OOM with memory.group.oom set. Then it checks that all
-+ * processes in the leaf (but not the parent) were killed.
-+ */
-+static int test_memcg_oom_group_leaf_events(const char *root)
-+{
-+ int ret = KSFT_FAIL;
-+ char *parent, *child;
-+
-+ parent = cg_name(root, "memcg_test_0");
-+ child = cg_name(root, "memcg_test_0/memcg_test_1");
-+
-+ if (!parent || !child)
-+ goto cleanup;
-+
-+ if (cg_create(parent))
-+ goto cleanup;
-+
-+ if (cg_create(child))
-+ goto cleanup;
-+
-+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
-+ goto cleanup;
-+
-+ if (cg_write(child, "memory.max", "50M"))
-+ goto cleanup;
-+
-+ if (cg_write(child, "memory.swap.max", "0"))
-+ goto cleanup;
-+
-+ if (cg_write(child, "memory.oom.group", "1"))
-+ goto cleanup;
-+
-+ cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
-+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
-+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
-+ if (!cg_run(child, alloc_anon, (void *)MB(100)))
-+ goto cleanup;
-+
-+ if (cg_test_proc_killed(child))
-+ goto cleanup;
-+
-+ if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
-+ goto cleanup;
-+
-+ if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0)
-+ goto cleanup;
-+
-+ ret = KSFT_PASS;
-+
-+cleanup:
-+ if (child)
-+ cg_destroy(child);
-+ if (parent)
-+ cg_destroy(parent);
-+ free(child);
-+ free(parent);
-+
-+ return ret;
-+}
-+
-+/*
-+ * This test disables swapping and tries to allocate anonymous memory
-+ * up to OOM with memory.group.oom set. Then it checks that all
-+ * processes in the parent and leaf were killed.
-+ */
-+static int test_memcg_oom_group_parent_events(const char *root)
-+{
-+ int ret = KSFT_FAIL;
-+ char *parent, *child;
-+
-+ parent = cg_name(root, "memcg_test_0");
-+ child = cg_name(root, "memcg_test_0/memcg_test_1");
-+
-+ if (!parent || !child)
-+ goto cleanup;
-+
-+ if (cg_create(parent))
-+ goto cleanup;
-+
-+ if (cg_create(child))
-+ goto cleanup;
-+
-+ if (cg_write(parent, "memory.max", "80M"))
-+ goto cleanup;
-+
-+ if (cg_write(parent, "memory.swap.max", "0"))
-+ goto cleanup;
-+
-+ if (cg_write(parent, "memory.oom.group", "1"))
-+ goto cleanup;
-+
-+ cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
-+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
-+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
-+
-+ if (!cg_run(child, alloc_anon, (void *)MB(100)))
-+ goto cleanup;
-+
-+ if (cg_test_proc_killed(child))
-+ goto cleanup;
-+ if (cg_test_proc_killed(parent))
-+ goto cleanup;
-+
-+ ret = KSFT_PASS;
-+
-+cleanup:
-+ if (child)
-+ cg_destroy(child);
-+ if (parent)
-+ cg_destroy(parent);
-+ free(child);
-+ free(parent);
-+
-+ return ret;
-+}
-+
-+/*
-+ * This test disables swapping and tries to allocate anonymous memory
-+ * up to OOM with memory.group.oom set. Then it checks that all
-+ * processes were killed except those set with OOM_SCORE_ADJ_MIN
-+ */
-+static int test_memcg_oom_group_score_events(const char *root)
-+{
-+ int ret = KSFT_FAIL;
-+ char *memcg;
-+ int safe_pid;
-+
-+ memcg = cg_name(root, "memcg_test_0");
-+
-+ if (!memcg)
-+ goto cleanup;
-+
-+ if (cg_create(memcg))
-+ goto cleanup;
-+
-+ if (cg_write(memcg, "memory.max", "50M"))
-+ goto cleanup;
-+
-+ if (cg_write(memcg, "memory.swap.max", "0"))
-+ goto cleanup;
-+
-+ if (cg_write(memcg, "memory.oom.group", "1"))
-+ goto cleanup;
-+
-+ safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
-+ if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
-+ goto cleanup;
-+
-+ cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
-+ if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
-+ goto cleanup;
-+
-+ if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
-+ goto cleanup;
-+
-+ if (kill(safe_pid, SIGKILL))
-+ goto cleanup;
-+
-+ ret = KSFT_PASS;
-+
-+cleanup:
-+ if (memcg)
-+ cg_destroy(memcg);
-+ free(memcg);
-+
-+ return ret;
-+}
-+
-+
- #define T(x) { x, #x }
- struct memcg_test {
- int (*fn)(const char *root);
-@@ -978,6 +1180,9 @@ struct memcg_test {
- T(test_memcg_oom_events),
- T(test_memcg_swap_max),
- T(test_memcg_sock),
-+ T(test_memcg_oom_group_leaf_events),
-+ T(test_memcg_oom_group_parent_events),
-+ T(test_memcg_oom_group_score_events),
- };
- #undef T
-
--- /dev/null
+From 6579804c431712d56956a63b1a01509441cc6800 Mon Sep 17 00:00:00 2001
+From: Paul Mackerras <paulus@ozlabs.org>
+Date: Thu, 4 Oct 2018 14:51:11 +1000
+Subject: KVM: PPC: Book3S HV: Avoid crash from THP collapse during radix page fault
+
+From: Paul Mackerras <paulus@ozlabs.org>
+
+commit 6579804c431712d56956a63b1a01509441cc6800 upstream.
+
+Commit 71d29f43b633 ("KVM: PPC: Book3S HV: Don't use compound_order to
+determine host mapping size", 2018-09-11) added a call to
+__find_linux_pte() and a dereference of the returned PTE pointer to the
+radix page fault path in the common case where the page is normal
+system memory. Previously, __find_linux_pte() was only called for
+mappings to physical addresses which don't have a page struct (e.g.
+memory-mapped I/O) or where the page struct is marked as reserved
+memory.
+
+This exposes us to the possibility that the returned PTE pointer
+could be NULL, for example in the case of a concurrent THP collapse
+operation. Dereferencing the returned NULL pointer causes a host
+crash.
+
+To fix this, we check for NULL, and if it is NULL, we retry the
+operation by returning to the guest, with the expectation that it
+will generate the same page fault again (unless of course it has
+been fixed up by another CPU in the meantime).
+
+Fixes: 71d29f43b633 ("KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size")
+Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_64_mmu_radix.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
+@@ -659,6 +659,16 @@ int kvmppc_book3s_radix_page_fault(struc
+ */
+ local_irq_disable();
+ ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
++ /*
++ * If the PTE disappeared temporarily due to a THP
++ * collapse, just return and let the guest try again.
++ */
++ if (!ptep) {
++ local_irq_enable();
++ if (page)
++ put_page(page);
++ return RESUME_GUEST;
++ }
+ pte = *ptep;
+ local_irq_enable();
+