]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 11 Oct 2020 08:47:39 +0000 (10:47 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 11 Oct 2020 08:47:39 +0000 (10:47 +0200)
added patches:
mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch

queue-4.9/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch b/queue-4.9/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch
new file mode 100644 (file)
index 0000000..0e17781
--- /dev/null
@@ -0,0 +1,106 @@
+From 033b5d77551167f8c24ca862ce83d3e0745f9245 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 9 Oct 2020 20:07:59 -0700
+Subject: mm/khugepaged: fix filemap page_to_pgoff(page) != offset
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 033b5d77551167f8c24ca862ce83d3e0745f9245 upstream.
+
+There have been elusive reports of filemap_fault() hitting its
+VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page) on kernels built
+with CONFIG_READ_ONLY_THP_FOR_FS=y.
+
+Suren has hit it on a kernel with CONFIG_READ_ONLY_THP_FOR_FS=y and
+CONFIG_NUMA is not set: and he has analyzed it down to how khugepaged
+without NUMA reuses the same huge page after collapse_file() failed
+(whereas NUMA targets its allocation to the respective node each time).
+And most of us were usually testing with CONFIG_NUMA=y kernels.
+
+collapse_file(old start)
+  new_page = khugepaged_alloc_page(hpage)
+  __SetPageLocked(new_page)
+  new_page->index = start // hpage->index=old offset
+  new_page->mapping = mapping
+  xas_store(&xas, new_page)
+
+                          filemap_fault
+                            page = find_get_page(mapping, offset)
+                            // if offset falls inside hpage then
+                            // compound_head(page) == hpage
+                            lock_page_maybe_drop_mmap()
+                              __lock_page(page)
+
+  // collapse fails
+  xas_store(&xas, old page)
+  new_page->mapping = NULL
+  unlock_page(new_page)
+
+collapse_file(new start)
+  new_page = khugepaged_alloc_page(hpage)
+  __SetPageLocked(new_page)
+  new_page->index = start // hpage->index=new offset
+  new_page->mapping = mapping // mapping becomes valid again
+
+                            // since compound_head(page) == hpage
+                            // page_to_pgoff(page) got changed
+                            VM_BUG_ON_PAGE(page_to_pgoff(page) != offset)
+
+An initial patch replaced __SetPageLocked() by lock_page(), which did
+fix the race which Suren illustrates above.  But testing showed that it's
+not good enough: if the racing task's __lock_page() gets delayed long
+after its find_get_page(), then it may follow collapse_file(new start)'s
+successful final unlock_page(), and crash on the same VM_BUG_ON_PAGE.
+
+It could be fixed by relaxing filemap_fault()'s VM_BUG_ON_PAGE to a
+check and retry (as is done for mapping), with similar relaxations in
+find_lock_entry() and pagecache_get_page(): but it's not obvious what
+else might get caught out; and khugepaged non-NUMA appears to be unique
+in exposing a page to page cache, then revoking, without going through
+a full cycle of freeing before reuse.
+
+Instead, non-NUMA khugepaged_prealloc_page() release the old page
+if anyone else has a reference to it (1% of cases when I tested).
+
+Although never reported on huge tmpfs, I believe its find_lock_entry()
+has been at similar risk; but huge tmpfs does not rely on khugepaged
+for its normal working nearly so much as READ_ONLY_THP_FOR_FS does.
+
+Reported-by: Denis Lisov <dennis.lissov@gmail.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206569
+Link: https://lore.kernel.org/linux-mm/?q=20200219144635.3b7417145de19b65f258c943%40linux-foundation.org
+Reported-by: Qian Cai <cai@lca.pw>
+Link: https://lore.kernel.org/linux-xfs/?q=20200616013309.GB815%40lca.pw
+Reported-and-analyzed-by: Suren Baghdasaryan <surenb@google.com>
+Fixes: 87c460a0bded ("mm/khugepaged: collapse_shmem() without freezing new_page")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: stable@vger.kernel.org # v4.9+
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/khugepaged.c |   12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -802,6 +802,18 @@ static struct page *khugepaged_alloc_hug
+ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+ {
++      /*
++       * If the hpage allocated earlier was briefly exposed in page cache
++       * before collapse_file() failed, it is possible that racing lookups
++       * have not yet completed, and would then be unpleasantly surprised by
++       * finding the hpage reused for the same mapping at a different offset.
++       * Just release the previous allocation if there is any danger of that.
++       */
++      if (*hpage && page_count(*hpage) > 1) {
++              put_page(*hpage);
++              *hpage = NULL;
++      }
++
+       if (!*hpage)
+               *hpage = khugepaged_alloc_hugepage(wait);
index b4864042880d3d75a942e71fc93426a21c06f32b..f234fdd0a9fe8d3131ebcfd120e90453df564965 100644 (file)
@@ -33,3 +33,4 @@ perf-top-fix-stdio-interface-input-handling-with-glibc-2.28.patch
 mtd-rawnand-sunxi-fix-the-probe-error-path.patch
 ftrace-move-rcu-is-watching-check-after-recursion-check.patch
 macsec-avoid-use-after-free-in-macsec_handle_frame.patch
+mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch