--- /dev/null
+From 5abfd71d936a8aefd9f9ccd299dea7a164a5d455 Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Tue, 22 Mar 2022 14:42:15 -0700
+Subject: mm: don't skip swap entry even if zap_details specified
+
+From: Peter Xu <peterx@redhat.com>
+
+commit 5abfd71d936a8aefd9f9ccd299dea7a164a5d455 upstream.
+
+Patch series "mm: Rework zap ptes on swap entries", v5.
+
+Patch 1 should fix a long standing bug for zap_pte_range() on
+zap_details usage. The risk is we could have some swap entries skipped
+while we should have zapped them.
+
+Migration entries are not the major concern because file backed memory
+always zap in the pattern that "first time without page lock, then
+re-zap with page lock" hence the 2nd zap will always make sure all
+migration entries are already recovered.
+
+However there can be issues with real swap entries got skipped
+errornoously. There's a reproducer provided in commit message of patch
+1 for that.
+
+Patch 2-4 are cleanups that are based on patch 1. After the whole
+patchset applied, we should have a very clean view of zap_pte_range().
+
+Only patch 1 needs to be backported to stable if necessary.
+
+This patch (of 4):
+
+The "details" pointer shouldn't be the token to decide whether we should
+skip swap entries.
+
+For example, when the callers specified details->zap_mapping==NULL, it
+means the user wants to zap all the pages (including COWed pages), then
+we need to look into swap entries because there can be private COWed
+pages that was swapped out.
+
+Skipping some swap entries when details is non-NULL may lead to wrongly
+leaving some of the swap entries while we should have zapped them.
+
+A reproducer of the problem:
+
+===8<===
+ #define _GNU_SOURCE /* See feature_test_macros(7) */
+ #include <stdio.h>
+ #include <assert.h>
+ #include <unistd.h>
+ #include <sys/mman.h>
+ #include <sys/types.h>
+
+ int page_size;
+ int shmem_fd;
+ char *buffer;
+
+ void main(void)
+ {
+ int ret;
+ char val;
+
+ page_size = getpagesize();
+ shmem_fd = memfd_create("test", 0);
+ assert(shmem_fd >= 0);
+
+ ret = ftruncate(shmem_fd, page_size * 2);
+ assert(ret == 0);
+
+ buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, shmem_fd, 0);
+ assert(buffer != MAP_FAILED);
+
+ /* Write private page, swap it out */
+ buffer[page_size] = 1;
+ madvise(buffer, page_size * 2, MADV_PAGEOUT);
+
+ /* This should drop private buffer[page_size] already */
+ ret = ftruncate(shmem_fd, page_size);
+ assert(ret == 0);
+ /* Recover the size */
+ ret = ftruncate(shmem_fd, page_size * 2);
+ assert(ret == 0);
+
+ /* Re-read the data, it should be all zero */
+ val = buffer[page_size];
+ if (val == 0)
+ printf("Good\n");
+ else
+ printf("BUG\n");
+ }
+===8<===
+
+We don't need to touch up the pmd path, because pmd never had a issue with
+swap entries. For example, shmem pmd migration will always be split into
+pte level, and same to swapping on anonymous.
+
+Add another helper should_zap_cows() so that we can also check whether we
+should zap private mappings when there's no page pointer specified.
+
+This patch drops that trick, so we handle swap ptes coherently. Meanwhile
+we should do the same check upon migration entry, hwpoison entry and
+genuine swap entries too.
+
+To be explicit, we should still remember to keep the private entries if
+even_cows==false, and always zap them when even_cows==true.
+
+The issue seems to exist starting from the initial commit of git.
+
+[peterx@redhat.com: comment tweaks]
+ Link: https://lkml.kernel.org/r/20220217060746.71256-2-peterx@redhat.com
+
+Link: https://lkml.kernel.org/r/20220217060746.71256-1-peterx@redhat.com
+Link: https://lkml.kernel.org/r/20220216094810.60572-1-peterx@redhat.com
+Link: https://lkml.kernel.org/r/20220216094810.60572-2-peterx@redhat.com
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: John Hubbard <jhubbard@nvidia.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 25 +++++++++++++++++++------
+ 1 file changed, 19 insertions(+), 6 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1301,6 +1301,17 @@ copy_page_range(struct vm_area_struct *d
+ return ret;
+ }
+
++/* Whether we should zap all COWed (private) pages too */
++static inline bool should_zap_cows(struct zap_details *details)
++{
++ /* By default, zap all pages */
++ if (!details)
++ return true;
++
++ /* Or, we zap COWed pages only if the caller wants to */
++ return !details->check_mapping;
++}
++
+ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+@@ -1396,16 +1407,18 @@ again:
+ continue;
+ }
+
+- /* If details->check_mapping, we leave swap entries. */
+- if (unlikely(details))
+- continue;
+-
+- if (!non_swap_entry(entry))
++ if (!non_swap_entry(entry)) {
++ /* Genuine swap entry, hence a private anon page */
++ if (!should_zap_cows(details))
++ continue;
+ rss[MM_SWAPENTS]--;
+- else if (is_migration_entry(entry)) {
++ } else if (is_migration_entry(entry)) {
+ struct page *page;
+
+ page = pfn_swap_entry_to_page(entry);
++ if (details && details->check_mapping &&
++ details->check_mapping != page_rmapping(page))
++ continue;
+ rss[mm_counter(page)]--;
+ }
+ if (unlikely(!free_swap_and_cache(entry)))
--- /dev/null
+From b09c2baa56347ae65795350dfcc633dedb1c2970 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Thu, 6 Jan 2022 11:02:29 -1000
+Subject: selftests: cgroup: Make cg_create() use 0755 for permission instead of 0644
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tejun Heo <tj@kernel.org>
+
+commit b09c2baa56347ae65795350dfcc633dedb1c2970 upstream.
+
+0644 is an odd perm to create a cgroup which is a directory. Use the regular
+0755 instead. This is necessary for euid switching test case.
+
+Reviewed-by: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/cgroup/cgroup_util.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/testing/selftests/cgroup/cgroup_util.c
++++ b/tools/testing/selftests/cgroup/cgroup_util.c
+@@ -221,7 +221,7 @@ int cg_find_unified_root(char *root, siz
+
+ int cg_create(const char *cgroup)
+ {
+- return mkdir(cgroup, 0644);
++ return mkdir(cgroup, 0755);
+ }
+
+ int cg_wait_for_proc_count(const char *cgroup, int count)
--- /dev/null
+From bf35a7879f1dfb0d050fe779168bcf25c7de66f5 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Thu, 6 Jan 2022 11:02:29 -1000
+Subject: selftests: cgroup: Test open-time cgroup namespace usage for migration checks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tejun Heo <tj@kernel.org>
+
+commit bf35a7879f1dfb0d050fe779168bcf25c7de66f5 upstream.
+
+When a task is writing to an fd opened by a different task, the perm check
+should use the cgroup namespace of the latter task. Add a test for it.
+
+Tested-by: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/cgroup/test_core.c | 97 +++++++++++++++++++++++++++++
+ 1 file changed, 97 insertions(+)
+
+--- a/tools/testing/selftests/cgroup/test_core.c
++++ b/tools/testing/selftests/cgroup/test_core.c
+@@ -1,11 +1,14 @@
+ /* SPDX-License-Identifier: GPL-2.0 */
+
++#define _GNU_SOURCE
+ #include <linux/limits.h>
++#include <linux/sched.h>
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #include <sys/wait.h>
+ #include <unistd.h>
+ #include <fcntl.h>
++#include <sched.h>
+ #include <stdio.h>
+ #include <errno.h>
+ #include <signal.h>
+@@ -741,6 +744,99 @@ cleanup:
+ return ret;
+ }
+
++struct lesser_ns_open_thread_arg {
++ const char *path;
++ int fd;
++ int err;
++};
++
++static int lesser_ns_open_thread_fn(void *arg)
++{
++ struct lesser_ns_open_thread_arg *targ = arg;
++
++ targ->fd = open(targ->path, O_RDWR);
++ targ->err = errno;
++ return 0;
++}
++
++/*
++ * cgroup migration permission check should be performed based on the cgroup
++ * namespace at the time of open instead of write.
++ */
++static int test_cgcore_lesser_ns_open(const char *root)
++{
++ static char stack[65536];
++ const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
++ int ret = KSFT_FAIL;
++ char *cg_test_a = NULL, *cg_test_b = NULL;
++ char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
++ int cg_test_b_procs_fd = -1;
++ struct lesser_ns_open_thread_arg targ = { .fd = -1 };
++ pid_t pid;
++ int status;
++
++ cg_test_a = cg_name(root, "cg_test_a");
++ cg_test_b = cg_name(root, "cg_test_b");
++
++ if (!cg_test_a || !cg_test_b)
++ goto cleanup;
++
++ cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
++ cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
++
++ if (!cg_test_a_procs || !cg_test_b_procs)
++ goto cleanup;
++
++ if (cg_create(cg_test_a) || cg_create(cg_test_b))
++ goto cleanup;
++
++ if (cg_enter_current(cg_test_b))
++ goto cleanup;
++
++ if (chown(cg_test_a_procs, test_euid, -1) ||
++ chown(cg_test_b_procs, test_euid, -1))
++ goto cleanup;
++
++ targ.path = cg_test_b_procs;
++ pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
++ CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
++ &targ);
++ if (pid < 0)
++ goto cleanup;
++
++ if (waitpid(pid, &status, 0) < 0)
++ goto cleanup;
++
++ if (!WIFEXITED(status))
++ goto cleanup;
++
++ cg_test_b_procs_fd = targ.fd;
++ if (cg_test_b_procs_fd < 0)
++ goto cleanup;
++
++ if (cg_enter_current(cg_test_a))
++ goto cleanup;
++
++ if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT)
++ goto cleanup;
++
++ ret = KSFT_PASS;
++
++cleanup:
++ cg_enter_current(root);
++ if (cg_test_b_procs_fd >= 0)
++ close(cg_test_b_procs_fd);
++ if (cg_test_b)
++ cg_destroy(cg_test_b);
++ if (cg_test_a)
++ cg_destroy(cg_test_a);
++ free(cg_test_b_procs);
++ free(cg_test_a_procs);
++ free(cg_test_b);
++ free(cg_test_a);
++ return ret;
++}
++
+ #define T(x) { x, #x }
+ struct corecg_test {
+ int (*fn)(const char *root);
+@@ -757,6 +853,7 @@ struct corecg_test {
+ T(test_cgcore_thread_migration),
+ T(test_cgcore_destroy),
+ T(test_cgcore_lesser_euid_open),
++ T(test_cgcore_lesser_ns_open),
+ };
+ #undef T
+
--- /dev/null
+From 613e040e4dc285367bff0f8f75ea59839bc10947 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Thu, 6 Jan 2022 11:02:29 -1000
+Subject: selftests: cgroup: Test open-time credential usage for migration checks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 613e040e4dc285367bff0f8f75ea59839bc10947 upstream.
+
+When a task is writing to an fd opened by a different task, the perm check
+should use the credentials of the latter task. Add a test for it.
+
+Tested-by: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/cgroup/test_core.c | 68 +++++++++++++++++++++++++++++
+ 1 file changed, 68 insertions(+)
+
+--- a/tools/testing/selftests/cgroup/test_core.c
++++ b/tools/testing/selftests/cgroup/test_core.c
+@@ -674,6 +674,73 @@ cleanup:
+ return ret;
+ }
+
++/*
++ * cgroup migration permission check should be performed based on the
++ * credentials at the time of open instead of write.
++ */
++static int test_cgcore_lesser_euid_open(const char *root)
++{
++ const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
++ int ret = KSFT_FAIL;
++ char *cg_test_a = NULL, *cg_test_b = NULL;
++ char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
++ int cg_test_b_procs_fd = -1;
++ uid_t saved_uid;
++
++ cg_test_a = cg_name(root, "cg_test_a");
++ cg_test_b = cg_name(root, "cg_test_b");
++
++ if (!cg_test_a || !cg_test_b)
++ goto cleanup;
++
++ cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
++ cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
++
++ if (!cg_test_a_procs || !cg_test_b_procs)
++ goto cleanup;
++
++ if (cg_create(cg_test_a) || cg_create(cg_test_b))
++ goto cleanup;
++
++ if (cg_enter_current(cg_test_a))
++ goto cleanup;
++
++ if (chown(cg_test_a_procs, test_euid, -1) ||
++ chown(cg_test_b_procs, test_euid, -1))
++ goto cleanup;
++
++ saved_uid = geteuid();
++ if (seteuid(test_euid))
++ goto cleanup;
++
++ cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
++
++ if (seteuid(saved_uid))
++ goto cleanup;
++
++ if (cg_test_b_procs_fd < 0)
++ goto cleanup;
++
++ if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES)
++ goto cleanup;
++
++ ret = KSFT_PASS;
++
++cleanup:
++ cg_enter_current(root);
++ if (cg_test_b_procs_fd >= 0)
++ close(cg_test_b_procs_fd);
++ if (cg_test_b)
++ cg_destroy(cg_test_b);
++ if (cg_test_a)
++ cg_destroy(cg_test_a);
++ free(cg_test_b_procs);
++ free(cg_test_a_procs);
++ free(cg_test_b);
++ free(cg_test_a);
++ return ret;
++}
++
+ #define T(x) { x, #x }
+ struct corecg_test {
+ int (*fn)(const char *root);
+@@ -689,6 +756,7 @@ struct corecg_test {
+ T(test_cgcore_proc_migration),
+ T(test_cgcore_thread_migration),
+ T(test_cgcore_destroy),
++ T(test_cgcore_lesser_euid_open),
+ };
+ #undef T
+
kvm-avoid-null-pointer-dereference-in-kvm_dirty_ring_push.patch
revert-net-mlx5-accept-devlink-user-input-after-driver-initialization-complete.patch
ubsan-remove-config_ubsan_object_size.patch
+selftests-cgroup-make-cg_create-use-0755-for-permission-instead-of-0644.patch
+selftests-cgroup-test-open-time-credential-usage-for-migration-checks.patch
+selftests-cgroup-test-open-time-cgroup-namespace-usage-for-migration-checks.patch
+mm-don-t-skip-swap-entry-even-if-zap_details-specified.patch