--- /dev/null
+From 6a8f31910d7a4f229891283ae6858814deb55b89 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:21 -0800
+Subject: mm/huge_memory: fix lockdep complaint on 32-bit i_size_read()
+
+commit 006d3ff27e884f80bd7d306b041afc415f63598f upstream.
+
+Huge tmpfs testing, on a 32-bit kernel with lockdep enabled, showed that
+__split_huge_page() was using i_size_read() while holding the irq-safe
+lru_lock and page tree lock, but the 32-bit i_size_read() uses an
+irq-unsafe seqlock which should not be nested inside them.
+
+Instead, read the i_size earlier in split_huge_page_to_list(), and pass
+the end offset down to __split_huge_page(): all while holding head page
+lock, which is enough to prevent truncation of that extent before the
+page tree lock has been taken.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261520070.2275@eggly.anvils
+Fixes: baa355fd33142 ("thp: file pages support for split_huge_page()")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/huge_memory.c | 19 +++++++++++++------
+ 1 file changed, 13 insertions(+), 6 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 5beb62fa3d30..7ea8da990b9d 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1925,12 +1925,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ }
+
+ static void __split_huge_page(struct page *page, struct list_head *list,
+- unsigned long flags)
++ pgoff_t end, unsigned long flags)
+ {
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+- pgoff_t end = -1;
+ int i;
+
+ lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
+@@ -1938,9 +1937,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+- if (!PageAnon(page))
+- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+-
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ __split_huge_page_tail(head, i, lruvec, list);
+ /* Some pages can be beyond i_size: drop them from page cache */
+@@ -2093,6 +2089,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ int count, mapcount, extra_pins, ret;
+ bool mlocked;
+ unsigned long flags;
++ pgoff_t end;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+@@ -2114,6 +2111,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ goto out;
+ }
+ extra_pins = 0;
++ end = -1;
+ mapping = NULL;
+ anon_vma_lock_write(anon_vma);
+ } else {
+@@ -2129,6 +2127,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ extra_pins = HPAGE_PMD_NR;
+ anon_vma = NULL;
+ i_mmap_lock_read(mapping);
++
++ /*
++ *__split_huge_page() may need to trim off pages beyond EOF:
++ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
++ * which cannot be nested inside the page tree lock. So note
++ * end now: i_size itself may be changed at any moment, but
++ * head page lock is good enough to serialize the trimming.
++ */
++ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ }
+
+ /*
+@@ -2178,7 +2185,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ if (mapping)
+ __dec_node_page_state(page, NR_SHMEM_THPS);
+ spin_unlock(&pgdata->split_queue_lock);
+- __split_huge_page(page, list, flags);
++ __split_huge_page(page, list, end, flags);
+ ret = 0;
+ } else {
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+--
+2.17.1
+
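Note on the 32-bit i_size_read() discussed in the patch above: on 32-bit
SMP kernels it is implemented as a seqcount retry loop, roughly as in the
sketch below (paraphrased from include/linux/fs.h; exact details vary by
kernel version). The matching writer, i_size_write(), updates the seqcount
without disabling interrupts, which is why lockdep objects to this read
being nested inside the irq-safe lru_lock and page tree lock.

	static inline loff_t i_size_read(const struct inode *inode)
	{
		loff_t size;
		unsigned int seq;

		do {
			seq = read_seqcount_begin(&inode->i_size_seqcount);
			size = inode->i_size;
		} while (read_seqcount_retry(&inode->i_size_seqcount, seq));

		return size;
	}
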
--- /dev/null
+From 64fb3b48ff594d5eea8fd559819249b334e11c92 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:13 -0800
+Subject: mm/huge_memory: rename freeze_page() to unmap_page()
+
+commit 906f9cdfc2a0800f13683f9e4ebdfd08c12ee81b upstream.
+
+The term "freeze" is used in several ways in the kernel, and in mm it
+has the particular meaning of forcing page refcount temporarily to 0.
+freeze_page() is just too confusing a name for a function that unmaps a
+page: rename it unmap_page(), and rename unfreeze_page() remap_page().
+
+Went to change the mention of freeze_page() added later in mm/rmap.c,
+but found it to be incorrect: ordinary page reclaim reaches there too;
+but the substance of the comment still seems correct, so edit it down.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261514080.2275@eggly.anvils
+Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/huge_memory.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 9f7bba700e4e..583ad61cc2f1 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1839,7 +1839,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
+ }
+ }
+
+-static void freeze_page(struct page *page)
++static void unmap_page(struct page *page)
+ {
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+ TTU_RMAP_LOCKED;
+@@ -1862,7 +1862,7 @@ static void freeze_page(struct page *page)
+ VM_BUG_ON_PAGE(ret, page + i - 1);
+ }
+
+-static void unfreeze_page(struct page *page)
++static void remap_page(struct page *page)
+ {
+ int i;
+
+@@ -1971,7 +1971,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
+
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+
+- unfreeze_page(head);
++ remap_page(head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+@@ -2138,7 +2138,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ }
+
+ /*
+- * Racy check if we can split the page, before freeze_page() will
++ * Racy check if we can split the page, before unmap_page() will
+ * split PMDs
+ */
+ if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
+@@ -2147,7 +2147,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ }
+
+ mlocked = PageMlocked(page);
+- freeze_page(head);
++ unmap_page(head);
+ VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+ /* Make sure the page is not on per-CPU pagevec as it takes pin */
+@@ -2199,7 +2199,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ fail: if (mapping)
+ spin_unlock(&mapping->tree_lock);
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+- unfreeze_page(head);
++ remap_page(head);
+ ret = -EBUSY;
+ }
+
+--
+2.17.1
+
--- /dev/null
+From a046d00b5ca0a24ae17eac1498d4f769bb166eb1 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:16 -0800
+Subject: mm/huge_memory: splitting set mapping+index before unfreeze
+
+commit 173d9d9fd3ddae84c110fea8aedf1f26af6be9ec upstream.
+
+Huge tmpfs stress testing has occasionally hit shmem_undo_range()'s
+VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page).
+
+Move the setting of mapping and index up before the page_ref_unfreeze()
+in __split_huge_page_tail() to fix this: so that a page cache lookup
+cannot get a reference while the tail's mapping and index are unstable.
+
+In fact, might as well move them up before the smp_wmb(): I don't see an
+actual need for that, but if I'm missing something, this way round is
+safer than the other, and no less efficient.
+
+You might argue that VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page) is
+misplaced, and should be left until after the trylock_page(); but left
+as it is, it has not crashed since, and gives more stringent assurance.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261516380.2275@eggly.anvils
+Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()")
+Requires: 605ca5ede764 ("mm/huge_memory.c: reorder operations in __split_huge_page_tail()")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/huge_memory.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index c14aec110e90..5beb62fa3d30 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1894,6 +1894,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ (1L << PG_unevictable) |
+ (1L << PG_dirty)));
+
++ /* ->mapping in first tail page is compound_mapcount */
++ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
++ page_tail);
++ page_tail->mapping = head->mapping;
++ page_tail->index = head->index + tail;
++
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
+
+@@ -1914,12 +1920,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+- /* ->mapping in first tail page is compound_mapcount */
+- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+- page_tail);
+- page_tail->mapping = head->mapping;
+-
+- page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+ }
+--
+2.17.1
+
--- /dev/null
+From 9a20eb64d247afb1bcb505380745460e4246e19b Mon Sep 17 00:00:00 2001
+From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Date: Thu, 5 Apr 2018 16:23:28 -0700
+Subject: mm/huge_memory.c: reorder operations in __split_huge_page_tail()
+
+commit 605ca5ede7643a01f4c4a15913f9714ac297f8a6 upstream.
+
+THP split makes a non-atomic change to tail page flags. This is almost
+OK because tail pages are locked and isolated, but it breaks recent
+changes in page locking: the non-atomic operation could clear bit
+PG_waiters.
+
+As a result, a concurrent sequence get_page_unless_zero() -> lock_page()
+might block forever, especially if this page was truncated later.
+
+Fix is trivial: clone flags before unfreezing page reference counter.
+
+This race exists since commit 62906027091f ("mm: add PageWaiters
+indicating tasks are waiting for a page bit"), while the unsafe unfreeze
+itself was added in commit 8df651c7059e ("thp: cleanup
+split_huge_page()").
+
+clear_compound_head() also must be called before unfreezing page
+reference because after successful get_page_unless_zero() might follow
+put_page() which needs correct compound_head().
+
+And replace page_ref_inc()/page_ref_add() with page_ref_unfreeze(), which
+is made especially for that and has the semantics of smp_store_release().
+
+Link: http://lkml.kernel.org/r/151844393341.210639.13162088407980624477.stgit@buzz
+Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Nicholas Piggin <npiggin@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/huge_memory.c | 36 +++++++++++++++---------------------
+ 1 file changed, 15 insertions(+), 21 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 583ad61cc2f1..c14aec110e90 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1876,26 +1876,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ struct page *page_tail = head + tail;
+
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+- VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
+
+ /*
+- * tail_page->_refcount is zero and not changing from under us. But
+- * get_page_unless_zero() may be running from under us on the
+- * tail_page. If we used atomic_set() below instead of atomic_inc() or
+- * atomic_add(), we would then run atomic_set() concurrently with
+- * get_page_unless_zero(), and atomic_set() is implemented in C not
+- * using locked ops. spin_unlock on x86 sometime uses locked ops
+- * because of PPro errata 66, 92, so unless somebody can guarantee
+- * atomic_set() here would be safe on all archs (and not only on x86),
+- * it's safer to use atomic_inc()/atomic_add().
++ * Clone page flags before unfreezing refcount.
++ *
++ * After successful get_page_unless_zero() might follow flags change,
++	 * for example lock_page() which sets PG_waiters.
+ */
+- if (PageAnon(head)) {
+- page_ref_inc(page_tail);
+- } else {
+- /* Additional pin to radix tree */
+- page_ref_add(page_tail, 2);
+- }
+-
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+@@ -1907,14 +1894,21 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ (1L << PG_unevictable) |
+ (1L << PG_dirty)));
+
+- /*
+- * After clearing PageTail the gup refcount can be released.
+- * Page flags also must be visible before we make the page non-compound.
+- */
++ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
+
++ /*
++ * Clear PageTail before unfreezing page refcount.
++ *
++ * After successful get_page_unless_zero() might follow put_page()
++ * which needs correct compound_head().
++ */
+ clear_compound_head(page_tail);
+
++ /* Finally unfreeze refcount. Additional reference from page cache. */
++ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
++ PageSwapCache(head)));
++
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+--
+2.17.1
+
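A note on the ordering this patch relies on: every field of the tail page
(flags, mapping, compound_head) must be published before its refcount can
be observed as non-zero by a speculative getter. The standalone C11 model
below illustrates that release/acquire pairing; it is only a sketch, not
the kernel's page_ref_unfreeze() or get_page_unless_zero() implementation.

	#include <stdatomic.h>
	#include <stdbool.h>

	struct model_page {
		unsigned long flags;	/* initialised while frozen */
		void *mapping;		/* initialised while frozen */
		atomic_int refcount;	/* 0 means frozen */
	};

	static void model_ref_unfreeze(struct model_page *p, int count)
	{
		/* release: all earlier stores to *p happen-before this store */
		atomic_store_explicit(&p->refcount, count, memory_order_release);
	}

	static bool model_get_unless_zero(struct model_page *p)
	{
		int old = atomic_load_explicit(&p->refcount, memory_order_relaxed);

		while (old != 0) {
			/* acquire on success pairs with the release above */
			if (atomic_compare_exchange_weak_explicit(&p->refcount,
					&old, old + 1, memory_order_acquire,
					memory_order_relaxed))
				return true;
		}
		return false;
	}
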
--- /dev/null
+From 7c5584653f9c189445f428398fed77081888049d Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:47 -0800
+Subject: mm/khugepaged: collapse_shmem() do not crash on Compound
+
+commit 06a5e1268a5fb9c2b346a3da6b97e85f2eba0f07 upstream.
+
+collapse_shmem()'s VM_BUG_ON_PAGE(PageTransCompound) was unsafe: before
+it holds the page lock of the first page, racing truncation then extension
+might conceivably have inserted a hugepage there already. Fail with the
+SCAN_PAGE_COMPOUND result, instead of crashing (CONFIG_DEBUG_VM=y) or
+otherwise mishandling the unexpected hugepage - though later we might
+code up a more constructive way of handling it, with SCAN_SUCCESS.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261529310.2275@eggly.anvils
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index b87bd43993bd..e0cfc3a54b6a 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1400,7 +1400,15 @@ static void collapse_shmem(struct mm_struct *mm,
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
+- VM_BUG_ON_PAGE(PageTransCompound(page), page);
++
++ /*
++ * If file was truncated then extended, or hole-punched, before
++ * we locked the first page, then a THP might be there already.
++ */
++ if (PageTransCompound(page)) {
++ result = SCAN_PAGE_COMPOUND;
++ goto out_unlock;
++ }
+
+ if (page_mapping(page) != mapping) {
+ result = SCAN_TRUNCATED;
+--
+2.17.1
+
--- /dev/null
+From f3954f5504d74b093c531e6c3f6f14ceef61ca57 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:35 -0800
+Subject: mm/khugepaged: collapse_shmem() remember to clear holes
+
+commit 2af8ff291848cc4b1cce24b6c943394eb2c761e8 upstream.
+
+Huge tmpfs testing reminds us that there is no __GFP_ZERO in the gfp
+flags khugepaged uses to allocate a huge page - in all common cases it
+would just be a waste of effort - so collapse_shmem() must remember to
+clear out any holes that it instantiates.
+
+The obvious place to do so, where they are put into the page cache tree,
+is not a good choice: because interrupts are disabled there. Leave it
+until further down, once success is assured, where the other pages are
+copied (before setting PageUptodate).
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261525080.2275@eggly.anvils
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 3f7bfd98b0e6..2d3ce49f6b45 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1501,7 +1501,12 @@ static void collapse_shmem(struct mm_struct *mm,
+ * Replacing old pages with new one has succeed, now we need to
+ * copy the content and free old pages.
+ */
++ index = start;
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
++ while (index < page->index) {
++ clear_highpage(new_page + (index % HPAGE_PMD_NR));
++ index++;
++ }
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+ list_del(&page->lru);
+@@ -1511,6 +1516,11 @@ static void collapse_shmem(struct mm_struct *mm,
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ put_page(page);
++ index++;
++ }
++ while (index < end) {
++ clear_highpage(new_page + (index % HPAGE_PMD_NR));
++ index++;
+ }
+
+ local_irq_save(flags);
+--
+2.17.1
+
--- /dev/null
+From e42882c7909cc73b15473155ce9e1393eb367845 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:25 -0800
+Subject: mm/khugepaged: collapse_shmem() stop if punched or truncated
+
+commit 701270fa193aadf00bdcf607738f64997275d4c7 upstream.
+
+Huge tmpfs testing showed that although collapse_shmem() recognizes a
+concurrently truncated or hole-punched page correctly, its handling of
+holes was liable to refill an emptied extent. Add a check to stop that.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261522040.2275@eggly.anvils
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Matthew Wilcox <willy@infradead.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 1df37ee996d5..62de24194f24 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1351,6 +1351,16 @@ static void collapse_shmem(struct mm_struct *mm,
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ int n = min(iter.index, end) - index;
+
++ /*
++ * Stop if extent has been hole-punched, and is now completely
++ * empty (the more obvious i_size_read() check would take an
++ * irq-unsafe seqlock on 32-bit).
++ */
++ if (n >= HPAGE_PMD_NR) {
++ result = SCAN_TRUNCATED;
++ goto tree_locked;
++ }
++
+ /*
+ * Handle holes in the radix tree: charge it from shmem and
+ * insert relevant subpage of new_page into the radix-tree.
+@@ -1462,6 +1472,11 @@ static void collapse_shmem(struct mm_struct *mm,
+ if (result == SCAN_SUCCEED && index < end) {
+ int n = end - index;
+
++ /* Stop if extent has been truncated, and is now empty */
++ if (n >= HPAGE_PMD_NR) {
++ result = SCAN_TRUNCATED;
++ goto tree_locked;
++ }
+ if (!shmem_charge(mapping->host, n)) {
+ result = SCAN_FAIL;
+ goto tree_locked;
+--
+2.17.1
+
--- /dev/null
+From 7736d26bcb16e54f444a8ee3f562b4ad43dd88ac Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:43 -0800
+Subject: mm/khugepaged: collapse_shmem() without freezing new_page
+
+commit 87c460a0bded56195b5eb497d44709777ef7b415 upstream.
+
+khugepaged's collapse_shmem() does almost all of its work, to assemble
+the huge new_page from 512 scattered old pages, with the new_page's
+refcount frozen to 0 (and refcounts of all old pages so far also frozen
+to 0). Including shmem_getpage() to read in any which were out on swap,
+memory reclaim if necessary to allocate their intermediate pages, and
+copying over all the data from old to new.
+
+Imagine the frozen refcount as a spinlock held, but without any lock
+debugging to highlight the abuse: it's not good, and under serious load
+heads into lockups - speculative getters of the page are not expecting
+to spin while khugepaged is rescheduled.
+
+One can get a little further under load by hacking around elsewhere; but
+fortunately, freezing the new_page turns out to have been entirely
+unnecessary, with no hacks needed elsewhere.
+
+The huge new_page lock is already held throughout, and guards all its
+subpages as they are brought one by one into the page cache tree; and
+anything reading the data in that page, without the lock, before it has
+been marked PageUptodate, would already be in the wrong. So simply
+eliminate the freezing of the new_page.
+
+Each of the old pages remains frozen with refcount 0 after it has been
+replaced by a new_page subpage in the page cache tree, until they are
+all unfrozen on success or failure: just as before. They could be
+unfrozen sooner, but cause no problem once no longer visible to
+find_get_entry(), filemap_map_pages() and other speculative lookups.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261527570.2275@eggly.anvils
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 19 +++++++------------
+ 1 file changed, 7 insertions(+), 12 deletions(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 47b83030fc53..b87bd43993bd 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1286,7 +1286,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+ * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ *
+ * Basic scheme is simple, details are more complex:
+- * - allocate and freeze a new huge page;
++ * - allocate and lock a new huge page;
+ * - scan over radix tree replacing old pages the new one
+ * + swap in pages if necessary;
+ * + fill in gaps;
+@@ -1294,11 +1294,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+ * - if replacing succeed:
+ * + copy data over;
+ * + free old pages;
+- * + unfreeze huge page;
++ * + unlock huge page;
+ * - if replacing failed;
+ * + put all pages back and unfreeze them;
+ * + restore gaps in the radix-tree;
+- * + free huge page;
++ * + unlock and free huge page;
+ */
+ static void collapse_shmem(struct mm_struct *mm,
+ struct address_space *mapping, pgoff_t start,
+@@ -1336,13 +1336,11 @@ static void collapse_shmem(struct mm_struct *mm,
+ __SetPageSwapBacked(new_page);
+ new_page->index = start;
+ new_page->mapping = mapping;
+- BUG_ON(!page_ref_freeze(new_page, 1));
+
+ /*
+- * At this point the new_page is 'frozen' (page_count() is zero), locked
+- * and not up-to-date. It's safe to insert it into radix tree, because
+- * nobody would be able to map it or use it in other way until we
+- * unfreeze it.
++ * At this point the new_page is locked and not up-to-date.
++ * It's safe to insert it into the page cache, because nobody would
++ * be able to map it or use it in another way until we unlock it.
+ */
+
+ index = start;
+@@ -1520,9 +1518,8 @@ static void collapse_shmem(struct mm_struct *mm,
+ index++;
+ }
+
+- /* Everything is ready, let's unfreeze the new_page */
+ SetPageUptodate(new_page);
+- page_ref_unfreeze(new_page, HPAGE_PMD_NR);
++ page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ set_page_dirty(new_page);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
+ lru_cache_add_anon(new_page);
+@@ -1570,8 +1567,6 @@ static void collapse_shmem(struct mm_struct *mm,
+ VM_BUG_ON(nr_none);
+ spin_unlock_irq(&mapping->tree_lock);
+
+- /* Unfreeze new_page, caller would take care about freeing it */
+- page_ref_unfreeze(new_page, 1);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ new_page->mapping = NULL;
+ }
+--
+2.17.1
+
--- /dev/null
+From e4f8f8284389e7cf6b91adf3bed5ebe334ce7f10 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:29 -0800
+Subject: mm/khugepaged: fix crashes due to misaccounted holes
+
+commit aaa52e340073b7f4593b3c4ddafcafa70cf838b5 upstream.
+
+Huge tmpfs testing on a shortish file mapped into a pmd-rounded extent
+hit shmem_evict_inode()'s WARN_ON(inode->i_blocks) followed by
+clear_inode()'s BUG_ON(inode->i_data.nrpages) when the file was later
+closed and unlinked.
+
+khugepaged's collapse_shmem() was forgetting to update mapping->nrpages
+on the rollback path, after it had added some holes which it then
+needed to undo.
+
+There is indeed an irritating asymmetry between shmem_charge(), whose
+callers want it to increment nrpages after successfully accounting
+blocks, and shmem_uncharge(), when __delete_from_page_cache() already
+decremented nrpages itself: oh well, just add a comment on that to them
+both.
+
+And shmem_recalc_inode() is supposed to be called when the accounting is
+expected to be in balance (so it can deduce from imbalance that reclaim
+discarded some pages): so change shmem_charge() to update nrpages
+earlier (though it's rare for the difference to matter at all).
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261523450.2275@eggly.anvils
+Fixes: 800d8c63b2e98 ("shmem: add huge pages support")
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 4 +++-
+ mm/shmem.c | 6 +++++-
+ 2 files changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 62de24194f24..3f7bfd98b0e6 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1538,8 +1538,10 @@ static void collapse_shmem(struct mm_struct *mm,
+ *hpage = NULL;
+ } else {
+ /* Something went wrong: rollback changes to the radix-tree */
+- shmem_uncharge(mapping->host, nr_none);
+ spin_lock_irq(&mapping->tree_lock);
++ mapping->nrpages -= nr_none;
++ shmem_uncharge(mapping->host, nr_none);
++
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ start) {
+ if (iter.index >= end)
+diff --git a/mm/shmem.c b/mm/shmem.c
+index e30ffaa065a4..54911bbc74d6 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -283,12 +283,14 @@ bool shmem_charge(struct inode *inode, long pages)
+ if (!shmem_inode_acct_block(inode, pages))
+ return false;
+
++ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
++ inode->i_mapping->nrpages += pages;
++
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced += pages;
+ inode->i_blocks += pages * BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+- inode->i_mapping->nrpages += pages;
+
+ return true;
+ }
+@@ -298,6 +300,8 @@ void shmem_uncharge(struct inode *inode, long pages)
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long flags;
+
++ /* nrpages adjustment done by __delete_from_page_cache() or caller */
++
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced -= pages;
+ inode->i_blocks -= pages * BLOCKS_PER_PAGE;
+--
+2.17.1
+
--- /dev/null
+From b32ac19d2e6cfa7bb93ac3b9ef6172a0399e2c1d Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Fri, 30 Nov 2018 14:10:39 -0800
+Subject: mm/khugepaged: minor reorderings in collapse_shmem()
+
+commit 042a30824871fa3149b0127009074b75cc25863c upstream.
+
+Several cleanups in collapse_shmem(): most of which probably do not
+really matter, beyond doing things in a more familiar and reassuring
+order. Simplify the failure gotos in the main loop, and on success
+update stats while interrupts still disabled from the last iteration.
+
+Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261526400.2275@eggly.anvils
+Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Jerome Glisse <jglisse@redhat.com>
+Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 73 ++++++++++++++++++++-----------------------------
+ 1 file changed, 30 insertions(+), 43 deletions(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 2d3ce49f6b45..47b83030fc53 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1332,13 +1332,12 @@ static void collapse_shmem(struct mm_struct *mm,
+ goto out;
+ }
+
++ __SetPageLocked(new_page);
++ __SetPageSwapBacked(new_page);
+ new_page->index = start;
+ new_page->mapping = mapping;
+- __SetPageSwapBacked(new_page);
+- __SetPageLocked(new_page);
+ BUG_ON(!page_ref_freeze(new_page, 1));
+
+-
+ /*
+ * At this point the new_page is 'frozen' (page_count() is zero), locked
+ * and not up-to-date. It's safe to insert it into radix tree, because
+@@ -1367,13 +1366,13 @@ static void collapse_shmem(struct mm_struct *mm,
+ */
+ if (n && !shmem_charge(mapping->host, n)) {
+ result = SCAN_FAIL;
+- break;
++ goto tree_locked;
+ }
+- nr_none += n;
+ for (; index < min(iter.index, end); index++) {
+ radix_tree_insert(&mapping->page_tree, index,
+ new_page + (index % HPAGE_PMD_NR));
+ }
++ nr_none += n;
+
+ /* We are done. */
+ if (index >= end)
+@@ -1389,12 +1388,12 @@ static void collapse_shmem(struct mm_struct *mm,
+ result = SCAN_FAIL;
+ goto tree_unlocked;
+ }
+- spin_lock_irq(&mapping->tree_lock);
+ } else if (trylock_page(page)) {
+ get_page(page);
++ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ result = SCAN_PAGE_LOCK;
+- break;
++ goto tree_locked;
+ }
+
+ /*
+@@ -1409,11 +1408,10 @@ static void collapse_shmem(struct mm_struct *mm,
+ result = SCAN_TRUNCATED;
+ goto out_unlock;
+ }
+- spin_unlock_irq(&mapping->tree_lock);
+
+ if (isolate_lru_page(page)) {
+ result = SCAN_DEL_PAGE_LRU;
+- goto out_isolate_failed;
++ goto out_unlock;
+ }
+
+ if (page_mapped(page))
+@@ -1435,7 +1433,9 @@ static void collapse_shmem(struct mm_struct *mm,
+ */
+ if (!page_ref_freeze(page, 3)) {
+ result = SCAN_PAGE_COUNT;
+- goto out_lru;
++ spin_unlock_irq(&mapping->tree_lock);
++ putback_lru_page(page);
++ goto out_unlock;
+ }
+
+ /*
+@@ -1451,17 +1451,10 @@ static void collapse_shmem(struct mm_struct *mm,
+ slot = radix_tree_iter_next(&iter);
+ index++;
+ continue;
+-out_lru:
+- spin_unlock_irq(&mapping->tree_lock);
+- putback_lru_page(page);
+-out_isolate_failed:
+- unlock_page(page);
+- put_page(page);
+- goto tree_unlocked;
+ out_unlock:
+ unlock_page(page);
+ put_page(page);
+- break;
++ goto tree_unlocked;
+ }
+
+ /*
+@@ -1469,7 +1462,7 @@ static void collapse_shmem(struct mm_struct *mm,
+ * This code only triggers if there's nothing in radix tree
+ * beyond 'end'.
+ */
+- if (result == SCAN_SUCCEED && index < end) {
++ if (index < end) {
+ int n = end - index;
+
+ /* Stop if extent has been truncated, and is now empty */
+@@ -1481,7 +1474,6 @@ static void collapse_shmem(struct mm_struct *mm,
+ result = SCAN_FAIL;
+ goto tree_locked;
+ }
+-
+ for (; index < end; index++) {
+ radix_tree_insert(&mapping->page_tree, index,
+ new_page + (index % HPAGE_PMD_NR));
+@@ -1489,14 +1481,19 @@ static void collapse_shmem(struct mm_struct *mm,
+ nr_none += n;
+ }
+
++ __inc_node_page_state(new_page, NR_SHMEM_THPS);
++ if (nr_none) {
++ struct zone *zone = page_zone(new_page);
++
++ __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
++ __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
++ }
++
+ tree_locked:
+ spin_unlock_irq(&mapping->tree_lock);
+ tree_unlocked:
+
+ if (result == SCAN_SUCCEED) {
+- unsigned long flags;
+- struct zone *zone = page_zone(new_page);
+-
+ /*
+ * Replacing old pages with new one has succeed, now we need to
+ * copy the content and free old pages.
+@@ -1510,11 +1507,11 @@ static void collapse_shmem(struct mm_struct *mm,
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+ list_del(&page->lru);
+- unlock_page(page);
+- page_ref_unfreeze(page, 1);
+ page->mapping = NULL;
++ page_ref_unfreeze(page, 1);
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
++ unlock_page(page);
+ put_page(page);
+ index++;
+ }
+@@ -1523,28 +1520,17 @@ static void collapse_shmem(struct mm_struct *mm,
+ index++;
+ }
+
+- local_irq_save(flags);
+- __inc_node_page_state(new_page, NR_SHMEM_THPS);
+- if (nr_none) {
+- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+- }
+- local_irq_restore(flags);
+-
+- /*
+- * Remove pte page tables, so we can re-faulti
+- * the page as huge.
+- */
+- retract_page_tables(mapping, start);
+-
+ /* Everything is ready, let's unfreeze the new_page */
+- set_page_dirty(new_page);
+ SetPageUptodate(new_page);
+ page_ref_unfreeze(new_page, HPAGE_PMD_NR);
++ set_page_dirty(new_page);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
+ lru_cache_add_anon(new_page);
+- unlock_page(new_page);
+
++ /*
++ * Remove pte page tables, so we can re-fault the page as huge.
++ */
++ retract_page_tables(mapping, start);
+ *hpage = NULL;
+ } else {
+ /* Something went wrong: rollback changes to the radix-tree */
+@@ -1576,8 +1562,8 @@ static void collapse_shmem(struct mm_struct *mm,
+ page_ref_unfreeze(page, 2);
+ radix_tree_replace_slot(slot, page);
+ spin_unlock_irq(&mapping->tree_lock);
+- putback_lru_page(page);
+ unlock_page(page);
++ putback_lru_page(page);
+ spin_lock_irq(&mapping->tree_lock);
+ slot = radix_tree_iter_next(&iter);
+ }
+@@ -1587,9 +1573,10 @@ static void collapse_shmem(struct mm_struct *mm,
+ /* Unfreeze new_page, caller would take care about freeing it */
+ page_ref_unfreeze(new_page, 1);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+- unlock_page(new_page);
+ new_page->mapping = NULL;
+ }
++
++ unlock_page(new_page);
+ out:
+ VM_BUG_ON(!list_empty(&pagelist));
+ /* TODO: tracepoints */
+--
+2.17.1
+
--- /dev/null
+mm-huge_memory-rename-freeze_page-to-unmap_page.patch
+mm-huge_memory.c-reorder-operations-in-__split_huge_.patch
+mm-huge_memory-splitting-set-mapping-index-before-un.patch
+mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_siz.patch
+mm-khugepaged-collapse_shmem-stop-if-punched-or-trun.patch
+shmem-shmem_charge-verify-max_block-is-not-exceeded-.patch
+shmem-introduce-shmem_inode_acct_block.patch
+mm-khugepaged-fix-crashes-due-to-misaccounted-holes.patch
+mm-khugepaged-collapse_shmem-remember-to-clear-holes.patch
+mm-khugepaged-minor-reorderings-in-collapse_shmem.patch
+mm-khugepaged-collapse_shmem-without-freezing-new_pa.patch
+mm-khugepaged-collapse_shmem-do-not-crash-on-compoun.patch
--- /dev/null
+From d29cd29cf934dea66d08c49f7a055abfa7d27257 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Date: Wed, 6 Sep 2017 16:22:59 -0700
+Subject: shmem: introduce shmem_inode_acct_block
+
+commit 0f0796945614b7523987f7eea32407421af4b1ee upstream.
+
+The shmem_acct_block() call and the update of used_blocks follow one
+another in all the places they are used. Combine the two into a
+helper function.
+
+Link: http://lkml.kernel.org/r/1497939652-16528-3-git-send-email-rppt@linux.vnet.ibm.com
+Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Pavel Emelyanov <xemul@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/shmem.c | 82 ++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 42 insertions(+), 40 deletions(-)
+
+diff --git a/mm/shmem.c b/mm/shmem.c
+index b26f11221ea8..e30ffaa065a4 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -181,6 +181,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+ vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
+ }
+
++static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
++{
++ struct shmem_inode_info *info = SHMEM_I(inode);
++ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
++
++ if (shmem_acct_block(info->flags, pages))
++ return false;
++
++ if (sbinfo->max_blocks) {
++ if (percpu_counter_compare(&sbinfo->used_blocks,
++ sbinfo->max_blocks - pages) > 0)
++ goto unacct;
++ percpu_counter_add(&sbinfo->used_blocks, pages);
++ }
++
++ return true;
++
++unacct:
++ shmem_unacct_blocks(info->flags, pages);
++ return false;
++}
++
++static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
++{
++ struct shmem_inode_info *info = SHMEM_I(inode);
++ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
++
++ if (sbinfo->max_blocks)
++ percpu_counter_sub(&sbinfo->used_blocks, pages);
++ shmem_unacct_blocks(info->flags, pages);
++}
++
+ static const struct super_operations shmem_ops;
+ static const struct address_space_operations shmem_aops;
+ static const struct file_operations shmem_file_operations;
+@@ -237,31 +269,20 @@ static void shmem_recalc_inode(struct inode *inode)
+
+ freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+ if (freed > 0) {
+- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+- if (sbinfo->max_blocks)
+- percpu_counter_add(&sbinfo->used_blocks, -freed);
+ info->alloced -= freed;
+ inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+- shmem_unacct_blocks(info->flags, freed);
++ shmem_inode_unacct_blocks(inode, freed);
+ }
+ }
+
+ bool shmem_charge(struct inode *inode, long pages)
+ {
+ struct shmem_inode_info *info = SHMEM_I(inode);
+- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ unsigned long flags;
+
+- if (shmem_acct_block(info->flags, pages))
++ if (!shmem_inode_acct_block(inode, pages))
+ return false;
+
+- if (sbinfo->max_blocks) {
+- if (percpu_counter_compare(&sbinfo->used_blocks,
+- sbinfo->max_blocks - pages) > 0)
+- goto unacct;
+- percpu_counter_add(&sbinfo->used_blocks, pages);
+- }
+-
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced += pages;
+ inode->i_blocks += pages * BLOCKS_PER_PAGE;
+@@ -270,16 +291,11 @@ bool shmem_charge(struct inode *inode, long pages)
+ inode->i_mapping->nrpages += pages;
+
+ return true;
+-
+-unacct:
+- shmem_unacct_blocks(info->flags, pages);
+- return false;
+ }
+
+ void shmem_uncharge(struct inode *inode, long pages)
+ {
+ struct shmem_inode_info *info = SHMEM_I(inode);
+- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ unsigned long flags;
+
+ spin_lock_irqsave(&info->lock, flags);
+@@ -288,9 +304,7 @@ void shmem_uncharge(struct inode *inode, long pages)
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+
+- if (sbinfo->max_blocks)
+- percpu_counter_sub(&sbinfo->used_blocks, pages);
+- shmem_unacct_blocks(info->flags, pages);
++ shmem_inode_unacct_blocks(inode, pages);
+ }
+
+ /*
+@@ -1423,9 +1437,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
+ }
+
+ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+- struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
++ struct inode *inode,
+ pgoff_t index, bool huge)
+ {
++ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct page *page;
+ int nr;
+ int err = -ENOSPC;
+@@ -1434,14 +1449,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+ huge = false;
+ nr = huge ? HPAGE_PMD_NR : 1;
+
+- if (shmem_acct_block(info->flags, nr))
++ if (!shmem_inode_acct_block(inode, nr))
+ goto failed;
+- if (sbinfo->max_blocks) {
+- if (percpu_counter_compare(&sbinfo->used_blocks,
+- sbinfo->max_blocks - nr) > 0)
+- goto unacct;
+- percpu_counter_add(&sbinfo->used_blocks, nr);
+- }
+
+ if (huge)
+ page = shmem_alloc_hugepage(gfp, info, index);
+@@ -1454,10 +1463,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+ }
+
+ err = -ENOMEM;
+- if (sbinfo->max_blocks)
+- percpu_counter_add(&sbinfo->used_blocks, -nr);
+-unacct:
+- shmem_unacct_blocks(info->flags, nr);
++ shmem_inode_unacct_blocks(inode, nr);
+ failed:
+ return ERR_PTR(err);
+ }
+@@ -1717,10 +1723,9 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ }
+
+ alloc_huge:
+- page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+- index, true);
++ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
++alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+@@ -1842,10 +1847,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+ * Error recovery.
+ */
+ unacct:
+- if (sbinfo->max_blocks)
+- percpu_counter_sub(&sbinfo->used_blocks,
+- 1 << compound_order(page));
+- shmem_unacct_blocks(info->flags, 1 << compound_order(page));
++ shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+
+ if (PageTransHuge(page)) {
+ unlock_page(page);
+--
+2.17.1
+
--- /dev/null
+From e572a0c101848b718dec80b7061ca221ed1e8308 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Date: Wed, 6 Sep 2017 16:22:56 -0700
+Subject: shmem: shmem_charge: verify max_block is not exceeded before inode
+ update
+
+commit b1cc94ab2f2ba31fcb2c59df0b9cf03f6d720553 upstream.
+
+Patch series "userfaultfd: enable zeropage support for shmem".
+
+These patches enable support for UFFDIO_ZEROPAGE for shared memory.
+
+The first two patches are not strictly related to userfaultfd, they are
+just minor refactoring to reduce amount of code duplication.
+
+This patch (of 7):
+
+Currently we update the inode and shmem_inode_info before verifying that
+used_blocks will not exceed max_blocks; if it does, we undo the update.
+Let's switch the order and move the verification of the block count
+before the inode and shmem_inode_info update.
+
+Link: http://lkml.kernel.org/r/1497939652-16528-2-git-send-email-rppt@linux.vnet.ibm.com
+Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Pavel Emelyanov <xemul@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/shmem.c | 25 ++++++++++++-------------
+ 1 file changed, 12 insertions(+), 13 deletions(-)
+
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 358a92be43eb..b26f11221ea8 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -254,6 +254,14 @@ bool shmem_charge(struct inode *inode, long pages)
+
+ if (shmem_acct_block(info->flags, pages))
+ return false;
++
++ if (sbinfo->max_blocks) {
++ if (percpu_counter_compare(&sbinfo->used_blocks,
++ sbinfo->max_blocks - pages) > 0)
++ goto unacct;
++ percpu_counter_add(&sbinfo->used_blocks, pages);
++ }
++
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced += pages;
+ inode->i_blocks += pages * BLOCKS_PER_PAGE;
+@@ -261,20 +269,11 @@ bool shmem_charge(struct inode *inode, long pages)
+ spin_unlock_irqrestore(&info->lock, flags);
+ inode->i_mapping->nrpages += pages;
+
+- if (!sbinfo->max_blocks)
+- return true;
+- if (percpu_counter_compare(&sbinfo->used_blocks,
+- sbinfo->max_blocks - pages) > 0) {
+- inode->i_mapping->nrpages -= pages;
+- spin_lock_irqsave(&info->lock, flags);
+- info->alloced -= pages;
+- shmem_recalc_inode(inode);
+- spin_unlock_irqrestore(&info->lock, flags);
+- shmem_unacct_blocks(info->flags, pages);
+- return false;
+- }
+- percpu_counter_add(&sbinfo->used_blocks, pages);
+ return true;
++
++unacct:
++ shmem_unacct_blocks(info->flags, pages);
++ return false;
+ }
+
+ void shmem_uncharge(struct inode *inode, long pages)
+--
+2.17.1
+