5.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 11 Oct 2020 08:48:19 +0000 (10:48 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 11 Oct 2020 08:48:19 +0000 (10:48 +0200)
added patches:
mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch
net-introduce-helper-sendpage_ok-in-include-linux-net.h.patch
nvme-tcp-check-page-by-sendpage_ok-before-calling-kernel_sendpage.patch
tcp-use-sendpage_ok-to-detect-misused-.sendpage.patch

queue-5.4/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch [new file with mode: 0644]
queue-5.4/net-introduce-helper-sendpage_ok-in-include-linux-net.h.patch [new file with mode: 0644]
queue-5.4/nvme-tcp-check-page-by-sendpage_ok-before-calling-kernel_sendpage.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/tcp-use-sendpage_ok-to-detect-misused-.sendpage.patch [new file with mode: 0644]

diff --git a/queue-5.4/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch b/queue-5.4/mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch
new file mode 100644 (file)
index 0000000..577726d
--- /dev/null
@@ -0,0 +1,106 @@
+From 033b5d77551167f8c24ca862ce83d3e0745f9245 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 9 Oct 2020 20:07:59 -0700
+Subject: mm/khugepaged: fix filemap page_to_pgoff(page) != offset
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 033b5d77551167f8c24ca862ce83d3e0745f9245 upstream.
+
+There have been elusive reports of filemap_fault() hitting its
+VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page) on kernels built
+with CONFIG_READ_ONLY_THP_FOR_FS=y.
+
+Suren has hit it on a kernel with CONFIG_READ_ONLY_THP_FOR_FS=y and
+CONFIG_NUMA is not set: and he has analyzed it down to how khugepaged
+without NUMA reuses the same huge page after collapse_file() failed
+(whereas NUMA targets its allocation to the respective node each time).
+And most of us were usually testing with CONFIG_NUMA=y kernels.
+
+collapse_file(old start)
+  new_page = khugepaged_alloc_page(hpage)
+  __SetPageLocked(new_page)
+  new_page->index = start // hpage->index=old offset
+  new_page->mapping = mapping
+  xas_store(&xas, new_page)
+
+                          filemap_fault
+                            page = find_get_page(mapping, offset)
+                            // if offset falls inside hpage then
+                            // compound_head(page) == hpage
+                            lock_page_maybe_drop_mmap()
+                              __lock_page(page)
+
+  // collapse fails
+  xas_store(&xas, old page)
+  new_page->mapping = NULL
+  unlock_page(new_page)
+
+collapse_file(new start)
+  new_page = khugepaged_alloc_page(hpage)
+  __SetPageLocked(new_page)
+  new_page->index = start // hpage->index=new offset
+  new_page->mapping = mapping // mapping becomes valid again
+
+                            // since compound_head(page) == hpage
+                            // page_to_pgoff(page) got changed
+                            VM_BUG_ON_PAGE(page_to_pgoff(page) != offset)
+
+An initial patch replaced __SetPageLocked() by lock_page(), which did
+fix the race which Suren illustrates above.  But testing showed that it's
+not good enough: if the racing task's __lock_page() gets delayed long
+after its find_get_page(), then it may follow collapse_file(new start)'s
+successful final unlock_page(), and crash on the same VM_BUG_ON_PAGE.
+
+It could be fixed by relaxing filemap_fault()'s VM_BUG_ON_PAGE to a
+check and retry (as is done for mapping), with similar relaxations in
+find_lock_entry() and pagecache_get_page(): but it's not obvious what
+else might get caught out; and khugepaged non-NUMA appears to be unique
+in exposing a page to page cache, then revoking, without going through
+a full cycle of freeing before reuse.
+
+Instead, non-NUMA khugepaged_prealloc_page() now releases the old page
+if anyone else has a reference to it (1% of cases when I tested).
+
+Although never reported on huge tmpfs, I believe its find_lock_entry()
+has been at similar risk; but huge tmpfs does not rely on khugepaged
+for its normal working nearly so much as READ_ONLY_THP_FOR_FS does.
+
+Reported-by: Denis Lisov <dennis.lissov@gmail.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206569
+Link: https://lore.kernel.org/linux-mm/?q=20200219144635.3b7417145de19b65f258c943%40linux-foundation.org
+Reported-by: Qian Cai <cai@lca.pw>
+Link: https://lore.kernel.org/linux-xfs/?q=20200616013309.GB815%40lca.pw
+Reported-and-analyzed-by: Suren Baghdasaryan <surenb@google.com>
+Fixes: 87c460a0bded ("mm/khugepaged: collapse_shmem() without freezing new_page")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: stable@vger.kernel.org # v4.9+
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/khugepaged.c |   12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -832,6 +832,18 @@ static struct page *khugepaged_alloc_hug
+ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+ {
++      /*
++       * If the hpage allocated earlier was briefly exposed in page cache
++       * before collapse_file() failed, it is possible that racing lookups
++       * have not yet completed, and would then be unpleasantly surprised by
++       * finding the hpage reused for the same mapping at a different offset.
++       * Just release the previous allocation if there is any danger of that.
++       */
++      if (*hpage && page_count(*hpage) > 1) {
++              put_page(*hpage);
++              *hpage = NULL;
++      }
++
+       if (!*hpage)
+               *hpage = khugepaged_alloc_hugepage(wait);
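
For readers unfamiliar with the non-NUMA khugepaged path the commit message refers to: when CONFIG_NUMA is not set, khugepaged keeps the preallocated huge page in *hpage and khugepaged_alloc_page() simply hands the same page back on the next collapse attempt. A simplified sketch of that reuse path (paraphrased from mm/khugepaged.c, not a verbatim copy of the 5.4 source):

static struct page *khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
	/* !CONFIG_NUMA: reuse the page preallocated by
	 * khugepaged_prealloc_page(); after a failed collapse_file() the very
	 * same struct page is exposed again at a different offset, which is
	 * the reuse that the new page_count() check above guards against. */
	VM_BUG_ON(!*hpage);
	return *hpage;
}

With the hunk above applied, a page_count(*hpage) greater than 1 means a racing lookup (such as filemap_fault()'s find_get_page()) still holds a reference, so the old page is released and a fresh one is allocated instead of being reused.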
diff --git a/queue-5.4/net-introduce-helper-sendpage_ok-in-include-linux-net.h.patch b/queue-5.4/net-introduce-helper-sendpage_ok-in-include-linux-net.h.patch
new file mode 100644 (file)
index 0000000..3ac857b
--- /dev/null
@@ -0,0 +1,76 @@
+From c381b07941adc2274ce552daf86c94701c5e265a Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 2 Oct 2020 16:27:28 +0800
+Subject: net: introduce helper sendpage_ok() in include/linux/net.h
+
+From: Coly Li <colyli@suse.de>
+
+commit c381b07941adc2274ce552daf86c94701c5e265a upstream.
+
+The original problem came from the nvme-over-tcp code, which mistakenly uses
+kernel_sendpage() to send pages allocated by __get_free_pages() without the
+__GFP_COMP flag. Such pages have no refcount (page_count is 0) on their tail
+pages; sending them with kernel_sendpage() may trigger a kernel panic from a
+corrupted kernel heap, because the network stack incorrectly frees these
+page_count 0 pages.
+
+This patch introduces a helper sendpage_ok(), which returns true if the page
+being checked,
+- is not a slab page: PageSlab(page) is false.
+- has a page refcount: page_count(page) is not zero.
+
+All drivers that want to send a page to a remote end with kernel_sendpage()
+may use this helper to check whether the page is OK. If the helper does not
+return true, the driver should fall back to a non-sendpage method (e.g.
+sock_no_sendpage()) to handle the page.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jan Kara <jack@suse.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Mikhail Skorzhinskii <mskorzhinskiy@solarflare.com>
+Cc: Philipp Reisner <philipp.reisner@linbit.com>
+Cc: Sagi Grimberg <sagi@grimberg.me>
+Cc: Vlastimil Babka <vbabka@suse.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/net.h |   16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/include/linux/net.h
++++ b/include/linux/net.h
+@@ -21,6 +21,7 @@
+ #include <linux/rcupdate.h>
+ #include <linux/once.h>
+ #include <linux/fs.h>
++#include <linux/mm.h>
+ #include <uapi/linux/net.h>
+@@ -288,6 +289,21 @@ do {                                                                      \
+ #define net_get_random_once_wait(buf, nbytes)                 \
+       get_random_once_wait((buf), (nbytes))
++/*
++ * E.g. XFS meta- & log-data is in slab pages, or bcache meta
++ * data pages, or other high order pages allocated by
++ * __get_free_pages() without __GFP_COMP, which have a page_count
++ * of 0 and/or have PageSlab() set. We cannot use send_page for
++ * those, as that does get_page(); put_page(); and would cause
++ * either a VM_BUG directly, or __page_cache_release a page that
++ * would actually still be referenced by someone, leading to some
++ * obscure delayed Oops somewhere else.
++ */
++static inline bool sendpage_ok(struct page *page)
++{
++      return !PageSlab(page) && page_count(page) >= 1;
++}
++
+ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
+                  size_t num, size_t len);
+ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
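
To make concrete what the new helper accepts and rejects, here is a hypothetical sketch (illustrative only; the demo function and buffer names are not part of the patch). A kmalloc() buffer sits in a slab page and fails the PageSlab() test, while the second page of a non-__GFP_COMP high-order allocation has page_count() == 0 and fails the refcount test, which is exactly the nvme-tcp case addressed by the next patch:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/printk.h>
#include <linux/slab.h>

static void sendpage_ok_demo(void)
{
	void *slab_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);	/* slab-backed */
	unsigned long raw = __get_free_pages(GFP_KERNEL, 1);	/* no __GFP_COMP */
	struct page *slab_page = virt_to_page(slab_buf);
	struct page *first = virt_to_page(raw);
	struct page *second = first + 1;

	/* allocation error handling omitted for brevity */
	pr_info("slab page:   %d\n", sendpage_ok(slab_page));	/* 0: PageSlab() is true */
	pr_info("first page:  %d\n", sendpage_ok(first));	/* 1: refcounted, not slab */
	pr_info("second page: %d\n", sendpage_ok(second));	/* 0: page_count() == 0 */

	kfree(slab_buf);
	free_pages(raw, 1);
}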
diff --git a/queue-5.4/nvme-tcp-check-page-by-sendpage_ok-before-calling-kernel_sendpage.patch b/queue-5.4/nvme-tcp-check-page-by-sendpage_ok-before-calling-kernel_sendpage.patch
new file mode 100644 (file)
index 0000000..c990545
--- /dev/null
@@ -0,0 +1,58 @@
+From 7d4194abfc4de13a2663c7fee6891de8360f7a52 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 2 Oct 2020 16:27:30 +0800
+Subject: nvme-tcp: check page by sendpage_ok() before calling kernel_sendpage()
+
+From: Coly Li <colyli@suse.de>
+
+commit 7d4194abfc4de13a2663c7fee6891de8360f7a52 upstream.
+
+Currently nvme_tcp_try_send_data() doesn't use kernel_sendpage() to send
+slab pages. But pages allocated by __get_free_pages() without __GFP_COMP,
+which also have a refcount of 0, are still sent to the remote end by
+kernel_sendpage(), which is problematic.
+
+The newly introduced helper sendpage_ok() checks both the PageSlab tag and
+the page_count counter, and returns true if the page is OK to be sent by
+kernel_sendpage().
+
+This patch fixes the page checking in nvme_tcp_try_send_data() with
+sendpage_ok(): if sendpage_ok() returns true, the page is sent by
+kernel_sendpage(), otherwise sock_no_sendpage() is used to handle it.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Hannes Reinecke <hare@suse.de>
+Cc: Jan Kara <jack@suse.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Mikhail Skorzhinskii <mskorzhinskiy@solarflare.com>
+Cc: Philipp Reisner <philipp.reisner@linbit.com>
+Cc: Sagi Grimberg <sagi@grimberg.me>
+Cc: Vlastimil Babka <vbabka@suse.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvme/host/tcp.c |    7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/drivers/nvme/host/tcp.c
++++ b/drivers/nvme/host/tcp.c
+@@ -861,12 +861,11 @@ static int nvme_tcp_try_send_data(struct
+               else
+                       flags |= MSG_MORE;
+-              /* can't zcopy slab pages */
+-              if (unlikely(PageSlab(page))) {
+-                      ret = sock_no_sendpage(queue->sock, page, offset, len,
++              if (sendpage_ok(page)) {
++                      ret = kernel_sendpage(queue->sock, page, offset, len,
+                                       flags);
+               } else {
+-                      ret = kernel_sendpage(queue->sock, page, offset, len,
++                      ret = sock_no_sendpage(queue->sock, page, offset, len,
+                                       flags);
+               }
+               if (ret <= 0)
diff --git a/queue-5.4/series b/queue-5.4/series
index 2e94331afed814a5fc2feeb347675def57ab8b20..f1ca3e6fa0f00e5f443d1f9891e175f5a46bf616 100644 (file)
@@ -31,3 +31,7 @@ btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch
 btrfs-allow-btrfs_truncate_block-to-fallback-to-nocow-for-data-space-reservation.patch
 nvme-core-put-ctrl-ref-when-module-ref-get-fail.patch
 macsec-avoid-use-after-free-in-macsec_handle_frame.patch
+mm-khugepaged-fix-filemap-page_to_pgoff-page-offset.patch
+net-introduce-helper-sendpage_ok-in-include-linux-net.h.patch
+tcp-use-sendpage_ok-to-detect-misused-.sendpage.patch
+nvme-tcp-check-page-by-sendpage_ok-before-calling-kernel_sendpage.patch
diff --git a/queue-5.4/tcp-use-sendpage_ok-to-detect-misused-.sendpage.patch b/queue-5.4/tcp-use-sendpage_ok-to-detect-misused-.sendpage.patch
new file mode 100644 (file)
index 0000000..531ea44
--- /dev/null
@@ -0,0 +1,46 @@
+From cf83a17edeeb36195596d2dae060a7c381db35f1 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 2 Oct 2020 16:27:31 +0800
+Subject: tcp: use sendpage_ok() to detect misused .sendpage
+
+From: Coly Li <colyli@suse.de>
+
+commit cf83a17edeeb36195596d2dae060a7c381db35f1 upstream.
+
+commit a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab
+objects") added the check for slab pages, but pages that have no refcount
+(page_count is 0) are still missing from the check.
+
+The network layer's sendpage method is not designed to send page_count 0
+pages either; therefore both PageSlab() and page_count() should be checked
+for the page being sent. This is exactly what sendpage_ok() does.
+
+This patch uses sendpage_ok() in do_tcp_sendpages() to detect misused
+.sendpage, to make the code more robust.
+
+Fixes: a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab objects")
+Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Vasily Averin <vvs@virtuozzo.com>
+Cc: David S. Miller <davem@davemloft.net>
+Cc: stable@vger.kernel.org
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/tcp.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -971,7 +971,8 @@ ssize_t do_tcp_sendpages(struct sock *sk
+       long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+       if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+-          WARN_ONCE(PageSlab(page), "page must not be a Slab one"))
++          WARN_ONCE(!sendpage_ok(page),
++                    "page must not be a Slab one and have page_count > 0"))
+               return -EINVAL;
+       /* Wait for a connection to finish. One exception is TCP Fast Open
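
As a closing illustration of what the strengthened check catches, consider a hypothetical misuse (not taken from any in-tree caller, and assuming a connected TCP struct socket *sock): kmalloc() memory is backed by slab pages, so with CONFIG_DEBUG_VM=y the WARN_ONCE() above fires and the send fails cleanly instead of corrupting the heap later.

	/* hypothetical misuse: a slab page handed to the TCP sendpage path */
	void *buf = kmalloc(512, GFP_KERNEL);
	struct page *page = virt_to_page(buf);
	int ret = kernel_sendpage(sock, page, offset_in_page(buf), 512, 0);
	/* sendpage_ok(page) is false, so with CONFIG_DEBUG_VM=y this returns
	 * -EINVAL via do_tcp_sendpages(); such callers should instead use
	 * sock_no_sendpage() or kernel_sendmsg() for slab-backed buffers. */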