--- /dev/null
+From e914d8f00391520ecc4495dd0ca0124538ab7119 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan@kernel.org>
+Date: Thu, 14 Apr 2022 19:13:46 -0700
+Subject: mm: fix unexpected zeroed page mapping with zram swap
+
+From: Minchan Kim <minchan@kernel.org>
+
+commit e914d8f00391520ecc4495dd0ca0124538ab7119 upstream.
+
+Two processes under CLONE_VM cloning, user process can be corrupted by
+seeing zeroed page unexpectedly.
+
+ CPU A CPU B
+
+ do_swap_page do_swap_page
+ SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path
+ swap_readpage valid data
+ swap_slot_free_notify
+ delete zram entry
+ swap_readpage zeroed(invalid) data
+ pte_lock
+ map the *zero data* to userspace
+ pte_unlock
+ pte_lock
+ if (!pte_same)
+ goto out_nomap;
+ pte_unlock
+ return and next refault will
+ read zeroed data
+
+The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't
+increase the refcount of swap slot at copy_mm so it couldn't catch up
+whether it's safe or not to discard data from backing device. In the
+case, only the lock it could rely on to synchronize swap slot freeing is
+page table lock. Thus, this patch gets rid of the swap_slot_free_notify
+function. With this patch, CPU A will see correct data.
+
+ CPU A CPU B
+
+ do_swap_page do_swap_page
+ SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path
+ swap_readpage original data
+ pte_lock
+ map the original data
+ swap_free
+ swap_range_free
+ bd_disk->fops->swap_slot_free_notify
+ swap_readpage read zeroed data
+ pte_unlock
+ pte_lock
+ if (!pte_same)
+ goto out_nomap;
+ pte_unlock
+ return
+ on next refault will see mapped data by CPU B
+
+The concern of the patch would increase memory consumption since it
+could keep wasted memory with compressed form in zram as well as
+uncompressed form in address space. However, most of cases of zram uses
+no readahead and do_swap_page is followed by swap_free so it will free
+the compressed form from in zram quickly.
+
+Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com
+Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
+Reported-by: Ivan Babrou <ivan@cloudflare.com>
+Tested-by: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Minchan Kim <minchan@kernel.org>
+Cc: Nitin Gupta <ngupta@vflare.org>
+Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: <stable@vger.kernel.org> [4.14+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_io.c | 55 -------------------------------------------------------
+ 1 file changed, 55 deletions(-)
+
+--- a/mm/page_io.c
++++ b/mm/page_io.c
+@@ -71,55 +71,6 @@ void end_swap_bio_write(struct bio *bio)
+ bio_put(bio);
+ }
+
+-static void swap_slot_free_notify(struct page *page)
+-{
+- struct swap_info_struct *sis;
+- struct gendisk *disk;
+- swp_entry_t entry;
+-
+- /*
+- * There is no guarantee that the page is in swap cache - the software
+- * suspend code (at least) uses end_swap_bio_read() against a non-
+- * swapcache page. So we must check PG_swapcache before proceeding with
+- * this optimization.
+- */
+- if (unlikely(!PageSwapCache(page)))
+- return;
+-
+- sis = page_swap_info(page);
+- if (!(sis->flags & SWP_BLKDEV))
+- return;
+-
+- /*
+- * The swap subsystem performs lazy swap slot freeing,
+- * expecting that the page will be swapped out again.
+- * So we can avoid an unnecessary write if the page
+- * isn't redirtied.
+- * This is good for real swap storage because we can
+- * reduce unnecessary I/O and enhance wear-leveling
+- * if an SSD is used as the as swap device.
+- * But if in-memory swap device (eg zram) is used,
+- * this causes a duplicated copy between uncompressed
+- * data in VM-owned memory and compressed data in
+- * zram-owned memory. So let's free zram-owned memory
+- * and make the VM-owned decompressed page *dirty*,
+- * so the page should be swapped out somewhere again if
+- * we again wish to reclaim it.
+- */
+- disk = sis->bdev->bd_disk;
+- entry.val = page_private(page);
+- if (disk->fops->swap_slot_free_notify &&
+- __swap_count(sis, entry) == 1) {
+- unsigned long offset;
+-
+- offset = swp_offset(entry);
+-
+- SetPageDirty(page);
+- disk->fops->swap_slot_free_notify(sis->bdev,
+- offset);
+- }
+-}
+-
+ static void end_swap_bio_read(struct bio *bio)
+ {
+ struct page *page = bio_first_page_all(bio);
+@@ -135,7 +86,6 @@ static void end_swap_bio_read(struct bio
+ }
+
+ SetPageUptodate(page);
+- swap_slot_free_notify(page);
+ out:
+ unlock_page(page);
+ WRITE_ONCE(bio->bi_private, NULL);
+@@ -373,11 +323,6 @@ int swap_readpage(struct page *page, boo
+
+ ret = bdev_read_page(sis->bdev, map_swap_page(page, &sis->bdev), page);
+ if (!ret) {
+- if (trylock_page(page)) {
+- swap_slot_free_notify(page);
+- unlock_page(page);
+- }
+-
+ count_vm_event(PSWPIN);
+ return 0;
+ }