From: Sasha Levin Date: Sun, 2 Dec 2018 08:31:57 +0000 (-0500) Subject: hugepage backports for 4.9 X-Git-Tag: v4.19.7~43 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9358679d8731063805ff6609574433b09f1ef01d;p=thirdparty%2Fkernel%2Fstable-queue.git hugepage backports for 4.9 Signed-off-by: Sasha Levin --- diff --git a/queue-4.9/mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_siz.patch b/queue-4.9/mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_siz.patch new file mode 100644 index 00000000000..e61b76a7ea3 --- /dev/null +++ b/queue-4.9/mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_siz.patch @@ -0,0 +1,104 @@ +From 6a8f31910d7a4f229891283ae6858814deb55b89 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:21 -0800 +Subject: mm/huge_memory: fix lockdep complaint on 32-bit i_size_read() + +commit 006d3ff27e884f80bd7d306b041afc415f63598f upstream. + +Huge tmpfs testing, on 32-bit kernel with lockdep enabled, showed that +__split_huge_page() was using i_size_read() while holding the irq-safe +lru_lock and page tree lock, but the 32-bit i_size_read() uses an +irq-unsafe seqlock which should not be nested inside them. + +Instead, read the i_size earlier in split_huge_page_to_list(), and pass +the end offset down to __split_huge_page(): all while holding head page +lock, which is enough to prevent truncation of that extent before the +page tree lock has been taken. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261520070.2275@eggly.anvils +Fixes: baa355fd33142 ("thp: file pages support for split_huge_page()") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 5beb62fa3d30..7ea8da990b9d 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1925,12 +1925,11 @@ static void __split_huge_page_tail(struct page *head, int tail, + } + + static void __split_huge_page(struct page *page, struct list_head *list, +- unsigned long flags) ++ pgoff_t end, unsigned long flags) + { + struct page *head = compound_head(page); + struct zone *zone = page_zone(head); + struct lruvec *lruvec; +- pgoff_t end = -1; + int i; + + lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); +@@ -1938,9 +1937,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(head); + +- if (!PageAnon(page)) +- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); +- + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + __split_huge_page_tail(head, i, lruvec, list); + /* Some pages can be beyond i_size: drop them from page cache */ +@@ -2093,6 +2089,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + int count, mapcount, extra_pins, ret; + bool mlocked; + unsigned long flags; ++ pgoff_t end; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); +@@ -2114,6 +2111,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + goto out; + } + extra_pins = 0; ++ end = -1; + mapping = NULL; + anon_vma_lock_write(anon_vma); + } else { +@@ -2129,6 +2127,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + extra_pins = HPAGE_PMD_NR; + anon_vma = NULL; + 
i_mmap_lock_read(mapping); ++ ++ /* ++ *__split_huge_page() may need to trim off pages beyond EOF: ++ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, ++ * which cannot be nested inside the page tree lock. So note ++ * end now: i_size itself may be changed at any moment, but ++ * head page lock is good enough to serialize the trimming. ++ */ ++ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + } + + /* +@@ -2178,7 +2185,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + if (mapping) + __dec_node_page_state(page, NR_SHMEM_THPS); + spin_unlock(&pgdata->split_queue_lock); +- __split_huge_page(page, list, flags); ++ __split_huge_page(page, list, end, flags); + ret = 0; + } else { + if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { +-- +2.17.1 + diff --git a/queue-4.9/mm-huge_memory-rename-freeze_page-to-unmap_page.patch b/queue-4.9/mm-huge_memory-rename-freeze_page-to-unmap_page.patch new file mode 100644 index 00000000000..cd888e8c104 --- /dev/null +++ b/queue-4.9/mm-huge_memory-rename-freeze_page-to-unmap_page.patch @@ -0,0 +1,92 @@ +From 64fb3b48ff594d5eea8fd559819249b334e11c92 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:13 -0800 +Subject: mm/huge_memory: rename freeze_page() to unmap_page() + +commit 906f9cdfc2a0800f13683f9e4ebdfd08c12ee81b upstream. + +The term "freeze" is used in several ways in the kernel, and in mm it +has the particular meaning of forcing page refcount temporarily to 0. +freeze_page() is just too confusing a name for a function that unmaps a +page: rename it unmap_page(), and rename unfreeze_page() remap_page(). + +Went to change the mention of freeze_page() added later in mm/rmap.c, +but found it to be incorrect: ordinary page reclaim reaches there too; +but the substance of the comment still seems correct, so edit it down. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261514080.2275@eggly.anvils +Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. 
Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 9f7bba700e4e..583ad61cc2f1 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1839,7 +1839,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, + } + } + +-static void freeze_page(struct page *page) ++static void unmap_page(struct page *page) + { + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED; +@@ -1862,7 +1862,7 @@ static void freeze_page(struct page *page) + VM_BUG_ON_PAGE(ret, page + i - 1); + } + +-static void unfreeze_page(struct page *page) ++static void remap_page(struct page *page) + { + int i; + +@@ -1971,7 +1971,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, + + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + +- unfreeze_page(head); ++ remap_page(head); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + struct page *subpage = head + i; +@@ -2138,7 +2138,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + } + + /* +- * Racy check if we can split the page, before freeze_page() will ++ * Racy check if we can split the page, before unmap_page() will + * split PMDs + */ + if (total_mapcount(head) != page_count(head) - extra_pins - 1) { +@@ -2147,7 +2147,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + } + + mlocked = PageMlocked(page); +- freeze_page(head); ++ unmap_page(head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ +@@ -2199,7 +2199,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + fail: if (mapping) + spin_unlock(&mapping->tree_lock); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); +- unfreeze_page(head); ++ remap_page(head); + ret = -EBUSY; + } + +-- +2.17.1 + diff --git a/queue-4.9/mm-huge_memory-splitting-set-mapping-index-before-un.patch b/queue-4.9/mm-huge_memory-splitting-set-mapping-index-before-un.patch new file mode 100644 index 00000000000..f44784cd24e --- /dev/null +++ b/queue-4.9/mm-huge_memory-splitting-set-mapping-index-before-un.patch @@ -0,0 +1,71 @@ +From a046d00b5ca0a24ae17eac1498d4f769bb166eb1 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:16 -0800 +Subject: mm/huge_memory: splitting set mapping+index before unfreeze + +commit 173d9d9fd3ddae84c110fea8aedf1f26af6be9ec upstream. + +Huge tmpfs stress testing has occasionally hit shmem_undo_range()'s +VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page). + +Move the setting of mapping and index up before the page_ref_unfreeze() +in __split_huge_page_tail() to fix this: so that a page cache lookup +cannot get a reference while the tail's mapping and index are unstable. + +In fact, might as well move them up before the smp_wmb(): I don't see an +actual need for that, but if I'm missing something, this way round is +safer than the other, and no less efficient. + +You might argue that VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page) is +misplaced, and should be left until after the trylock_page(); but left as +is has not crashed since, and gives more stringent assurance. 
+ +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261516380.2275@eggly.anvils +Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()") +Requires: 605ca5ede764 ("mm/huge_memory.c: reorder operations in __split_huge_page_tail()") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. Shutemov +Cc: Konstantin Khlebnikov +Cc: Jerome Glisse +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index c14aec110e90..5beb62fa3d30 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1894,6 +1894,12 @@ static void __split_huge_page_tail(struct page *head, int tail, + (1L << PG_unevictable) | + (1L << PG_dirty))); + ++ /* ->mapping in first tail page is compound_mapcount */ ++ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, ++ page_tail); ++ page_tail->mapping = head->mapping; ++ page_tail->index = head->index + tail; ++ + /* Page flags must be visible before we make the page non-compound. */ + smp_wmb(); + +@@ -1914,12 +1920,6 @@ static void __split_huge_page_tail(struct page *head, int tail, + if (page_is_idle(head)) + set_page_idle(page_tail); + +- /* ->mapping in first tail page is compound_mapcount */ +- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +- page_tail); +- page_tail->mapping = head->mapping; +- +- page_tail->index = head->index + tail; + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_page_tail(head, page_tail, lruvec, list); + } +-- +2.17.1 + diff --git a/queue-4.9/mm-huge_memory.c-reorder-operations-in-__split_huge_.patch b/queue-4.9/mm-huge_memory.c-reorder-operations-in-__split_huge_.patch new file mode 100644 index 00000000000..c4eb233ebfe --- /dev/null +++ b/queue-4.9/mm-huge_memory.c-reorder-operations-in-__split_huge_.patch @@ -0,0 +1,105 @@ +From 9a20eb64d247afb1bcb505380745460e4246e19b Mon Sep 17 00:00:00 2001 +From: Konstantin Khlebnikov +Date: Thu, 5 Apr 2018 16:23:28 -0700 +Subject: mm/huge_memory.c: reorder operations in __split_huge_page_tail() + +commit 605ca5ede7643a01f4c4a15913f9714ac297f8a6 upstream. + +THP split makes non-atomic change of tail page flags. This is almost ok +because tail pages are locked and isolated but this breaks recent +changes in page locking: non-atomic operation could clear bit +PG_waiters. + +As a result concurrent sequence get_page_unless_zero() -> lock_page() +might block forever. Especially if this page was truncated later. + +Fix is trivial: clone flags before unfreezing page reference counter. + +This race exists since commit 62906027091f ("mm: add PageWaiters +indicating tasks are waiting for a page bit") while unsave unfreeze +itself was added in commit 8df651c7059e ("thp: cleanup +split_huge_page()"). + +clear_compound_head() also must be called before unfreezing page +reference because after successful get_page_unless_zero() might follow +put_page() which needs correct compound_head(). + +And replace page_ref_inc()/page_ref_add() with page_ref_unfreeze() which +is made especially for that and has semantic of smp_store_release(). + +Link: http://lkml.kernel.org/r/151844393341.210639.13162088407980624477.stgit@buzz +Signed-off-by: Konstantin Khlebnikov +Acked-by: Kirill A. 
Shutemov +Cc: Michal Hocko +Cc: Nicholas Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 36 +++++++++++++++--------------------- + 1 file changed, 15 insertions(+), 21 deletions(-) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 583ad61cc2f1..c14aec110e90 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1876,26 +1876,13 @@ static void __split_huge_page_tail(struct page *head, int tail, + struct page *page_tail = head + tail; + + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); +- VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); + + /* +- * tail_page->_refcount is zero and not changing from under us. But +- * get_page_unless_zero() may be running from under us on the +- * tail_page. If we used atomic_set() below instead of atomic_inc() or +- * atomic_add(), we would then run atomic_set() concurrently with +- * get_page_unless_zero(), and atomic_set() is implemented in C not +- * using locked ops. spin_unlock on x86 sometime uses locked ops +- * because of PPro errata 66, 92, so unless somebody can guarantee +- * atomic_set() here would be safe on all archs (and not only on x86), +- * it's safer to use atomic_inc()/atomic_add(). ++ * Clone page flags before unfreezing refcount. ++ * ++ * After successful get_page_unless_zero() might follow flags change, ++ * for exmaple lock_page() which set PG_waiters. + */ +- if (PageAnon(head)) { +- page_ref_inc(page_tail); +- } else { +- /* Additional pin to radix tree */ +- page_ref_add(page_tail, 2); +- } +- + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | +@@ -1907,14 +1894,21 @@ static void __split_huge_page_tail(struct page *head, int tail, + (1L << PG_unevictable) | + (1L << PG_dirty))); + +- /* +- * After clearing PageTail the gup refcount can be released. +- * Page flags also must be visible before we make the page non-compound. +- */ ++ /* Page flags must be visible before we make the page non-compound. */ + smp_wmb(); + ++ /* ++ * Clear PageTail before unfreezing page refcount. ++ * ++ * After successful get_page_unless_zero() might follow put_page() ++ * which needs correct compound_head(). ++ */ + clear_compound_head(page_tail); + ++ /* Finally unfreeze refcount. Additional reference from page cache. */ ++ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || ++ PageSwapCache(head))); ++ + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-collapse_shmem-do-not-crash-on-compoun.patch b/queue-4.9/mm-khugepaged-collapse_shmem-do-not-crash-on-compoun.patch new file mode 100644 index 00000000000..b4e34f836ed --- /dev/null +++ b/queue-4.9/mm-khugepaged-collapse_shmem-do-not-crash-on-compoun.patch @@ -0,0 +1,53 @@ +From 7c5584653f9c189445f428398fed77081888049d Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:47 -0800 +Subject: mm/khugepaged: collapse_shmem() do not crash on Compound + +commit 06a5e1268a5fb9c2b346a3da6b97e85f2eba0f07 upstream. + +collapse_shmem()'s VM_BUG_ON_PAGE(PageTransCompound) was unsafe: before +it holds page lock of the first page, racing truncation then extension +might conceivably have inserted a hugepage there already. 
Fail with the +SCAN_PAGE_COMPOUND result, instead of crashing (CONFIG_DEBUG_VM=y) or +otherwise mishandling the unexpected hugepage - though later we might +code up a more constructive way of handling it, with SCAN_SUCCESS. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261529310.2275@eggly.anvils +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Cc: Kirill A. Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index b87bd43993bd..e0cfc3a54b6a 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1400,7 +1400,15 @@ static void collapse_shmem(struct mm_struct *mm, + */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); +- VM_BUG_ON_PAGE(PageTransCompound(page), page); ++ ++ /* ++ * If file was truncated then extended, or hole-punched, before ++ * we locked the first page, then a THP might be there already. ++ */ ++ if (PageTransCompound(page)) { ++ result = SCAN_PAGE_COMPOUND; ++ goto out_unlock; ++ } + + if (page_mapping(page) != mapping) { + result = SCAN_TRUNCATED; +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-collapse_shmem-remember-to-clear-holes.patch b/queue-4.9/mm-khugepaged-collapse_shmem-remember-to-clear-holes.patch new file mode 100644 index 00000000000..b2afc3691d8 --- /dev/null +++ b/queue-4.9/mm-khugepaged-collapse_shmem-remember-to-clear-holes.patch @@ -0,0 +1,64 @@ +From f3954f5504d74b093c531e6c3f6f14ceef61ca57 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:35 -0800 +Subject: mm/khugepaged: collapse_shmem() remember to clear holes + +commit 2af8ff291848cc4b1cce24b6c943394eb2c761e8 upstream. + +Huge tmpfs testing reminds us that there is no __GFP_ZERO in the gfp +flags khugepaged uses to allocate a huge page - in all common cases it +would just be a waste of effort - so collapse_shmem() must remember to +clear out any holes that it instantiates. + +The obvious place to do so, where they are put into the page cache tree, +is not a good choice: because interrupts are disabled there. Leave it +until further down, once success is assured, where the other pages are +copied (before setting PageUptodate). + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261525080.2275@eggly.anvils +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 3f7bfd98b0e6..2d3ce49f6b45 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1501,7 +1501,12 @@ static void collapse_shmem(struct mm_struct *mm, + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. 
+ */ ++ index = start; + list_for_each_entry_safe(page, tmp, &pagelist, lru) { ++ while (index < page->index) { ++ clear_highpage(new_page + (index % HPAGE_PMD_NR)); ++ index++; ++ } + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); +@@ -1511,6 +1516,11 @@ static void collapse_shmem(struct mm_struct *mm, + ClearPageActive(page); + ClearPageUnevictable(page); + put_page(page); ++ index++; ++ } ++ while (index < end) { ++ clear_highpage(new_page + (index % HPAGE_PMD_NR)); ++ index++; + } + + local_irq_save(flags); +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-collapse_shmem-stop-if-punched-or-trun.patch b/queue-4.9/mm-khugepaged-collapse_shmem-stop-if-punched-or-trun.patch new file mode 100644 index 00000000000..c8863fc2546 --- /dev/null +++ b/queue-4.9/mm-khugepaged-collapse_shmem-stop-if-punched-or-trun.patch @@ -0,0 +1,62 @@ +From e42882c7909cc73b15473155ce9e1393eb367845 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:25 -0800 +Subject: mm/khugepaged: collapse_shmem() stop if punched or truncated + +commit 701270fa193aadf00bdcf607738f64997275d4c7 upstream. + +Huge tmpfs testing showed that although collapse_shmem() recognizes a +concurrently truncated or hole-punched page correctly, its handling of +holes was liable to refill an emptied extent. Add check to stop that. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261522040.2275@eggly.anvils +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Reviewed-by: Matthew Wilcox +Cc: Kirill A. Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 1df37ee996d5..62de24194f24 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1351,6 +1351,16 @@ static void collapse_shmem(struct mm_struct *mm, + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + int n = min(iter.index, end) - index; + ++ /* ++ * Stop if extent has been hole-punched, and is now completely ++ * empty (the more obvious i_size_read() check would take an ++ * irq-unsafe seqlock on 32-bit). ++ */ ++ if (n >= HPAGE_PMD_NR) { ++ result = SCAN_TRUNCATED; ++ goto tree_locked; ++ } ++ + /* + * Handle holes in the radix tree: charge it from shmem and + * insert relevant subpage of new_page into the radix-tree. +@@ -1462,6 +1472,11 @@ static void collapse_shmem(struct mm_struct *mm, + if (result == SCAN_SUCCEED && index < end) { + int n = end - index; + ++ /* Stop if extent has been truncated, and is now empty */ ++ if (n >= HPAGE_PMD_NR) { ++ result = SCAN_TRUNCATED; ++ goto tree_locked; ++ } + if (!shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + goto tree_locked; +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-collapse_shmem-without-freezing-new_pa.patch b/queue-4.9/mm-khugepaged-collapse_shmem-without-freezing-new_pa.patch new file mode 100644 index 00000000000..eb91f449b9e --- /dev/null +++ b/queue-4.9/mm-khugepaged-collapse_shmem-without-freezing-new_pa.patch @@ -0,0 +1,117 @@ +From 7736d26bcb16e54f444a8ee3f562b4ad43dd88ac Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:43 -0800 +Subject: mm/khugepaged: collapse_shmem() without freezing new_page + +commit 87c460a0bded56195b5eb497d44709777ef7b415 upstream. 
+ +khugepaged's collapse_shmem() does almost all of its work, to assemble +the huge new_page from 512 scattered old pages, with the new_page's +refcount frozen to 0 (and refcounts of all old pages so far also frozen +to 0). Including shmem_getpage() to read in any which were out on swap, +memory reclaim if necessary to allocate their intermediate pages, and +copying over all the data from old to new. + +Imagine the frozen refcount as a spinlock held, but without any lock +debugging to highlight the abuse: it's not good, and under serious load +heads into lockups - speculative getters of the page are not expecting +to spin while khugepaged is rescheduled. + +One can get a little further under load by hacking around elsewhere; but +fortunately, freezing the new_page turns out to have been entirely +unnecessary, with no hacks needed elsewhere. + +The huge new_page lock is already held throughout, and guards all its +subpages as they are brought one by one into the page cache tree; and +anything reading the data in that page, without the lock, before it has +been marked PageUptodate, would already be in the wrong. So simply +eliminate the freezing of the new_page. + +Each of the old pages remains frozen with refcount 0 after it has been +replaced by a new_page subpage in the page cache tree, until they are +all unfrozen on success or failure: just as before. They could be +unfrozen sooner, but cause no problem once no longer visible to +find_get_entry(), filemap_map_pages() and other speculative lookups. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261527570.2275@eggly.anvils +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 47b83030fc53..b87bd43993bd 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1286,7 +1286,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) + * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * + * Basic scheme is simple, details are more complex: +- * - allocate and freeze a new huge page; ++ * - allocate and lock a new huge page; + * - scan over radix tree replacing old pages the new one + * + swap in pages if necessary; + * + fill in gaps; +@@ -1294,11 +1294,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) + * - if replacing succeed: + * + copy data over; + * + free old pages; +- * + unfreeze huge page; ++ * + unlock huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the radix-tree; +- * + free huge page; ++ * + unlock and free huge page; + */ + static void collapse_shmem(struct mm_struct *mm, + struct address_space *mapping, pgoff_t start, +@@ -1336,13 +1336,11 @@ static void collapse_shmem(struct mm_struct *mm, + __SetPageSwapBacked(new_page); + new_page->index = start; + new_page->mapping = mapping; +- BUG_ON(!page_ref_freeze(new_page, 1)); + + /* +- * At this point the new_page is 'frozen' (page_count() is zero), locked +- * and not up-to-date. It's safe to insert it into radix tree, because +- * nobody would be able to map it or use it in other way until we +- * unfreeze it. 
++ * At this point the new_page is locked and not up-to-date. ++ * It's safe to insert it into the page cache, because nobody would ++ * be able to map it or use it in another way until we unlock it. + */ + + index = start; +@@ -1520,9 +1518,8 @@ static void collapse_shmem(struct mm_struct *mm, + index++; + } + +- /* Everything is ready, let's unfreeze the new_page */ + SetPageUptodate(new_page); +- page_ref_unfreeze(new_page, HPAGE_PMD_NR); ++ page_ref_add(new_page, HPAGE_PMD_NR - 1); + set_page_dirty(new_page); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); +@@ -1570,8 +1567,6 @@ static void collapse_shmem(struct mm_struct *mm, + VM_BUG_ON(nr_none); + spin_unlock_irq(&mapping->tree_lock); + +- /* Unfreeze new_page, caller would take care about freeing it */ +- page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); + new_page->mapping = NULL; + } +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-fix-crashes-due-to-misaccounted-holes.patch b/queue-4.9/mm-khugepaged-fix-crashes-due-to-misaccounted-holes.patch new file mode 100644 index 00000000000..0bd26ccaea5 --- /dev/null +++ b/queue-4.9/mm-khugepaged-fix-crashes-due-to-misaccounted-holes.patch @@ -0,0 +1,92 @@ +From e4f8f8284389e7cf6b91adf3bed5ebe334ce7f10 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:29 -0800 +Subject: mm/khugepaged: fix crashes due to misaccounted holes + +commit aaa52e340073b7f4593b3c4ddafcafa70cf838b5 upstream. + +Huge tmpfs testing on a shortish file mapped into a pmd-rounded extent +hit shmem_evict_inode()'s WARN_ON(inode->i_blocks) followed by +clear_inode()'s BUG_ON(inode->i_data.nrpages) when the file was later +closed and unlinked. + +khugepaged's collapse_shmem() was forgetting to update mapping->nrpages +on the rollback path, after it had added but then needs to undo some +holes. + +There is indeed an irritating asymmetry between shmem_charge(), whose +callers want it to increment nrpages after successfully accounting +blocks, and shmem_uncharge(), when __delete_from_page_cache() already +decremented nrpages itself: oh well, just add a comment on that to them +both. + +And shmem_recalc_inode() is supposed to be called when the accounting is +expected to be in balance (so it can deduce from imbalance that reclaim +discarded some pages): so change shmem_charge() to update nrpages +earlier (though it's rare for the difference to matter at all). + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261523450.2275@eggly.anvils +Fixes: 800d8c63b2e98 ("shmem: add huge pages support") +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. 
Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 4 +++- + mm/shmem.c | 6 +++++- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 62de24194f24..3f7bfd98b0e6 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1538,8 +1538,10 @@ static void collapse_shmem(struct mm_struct *mm, + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ +- shmem_uncharge(mapping->host, nr_none); + spin_lock_irq(&mapping->tree_lock); ++ mapping->nrpages -= nr_none; ++ shmem_uncharge(mapping->host, nr_none); ++ + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start) { + if (iter.index >= end) +diff --git a/mm/shmem.c b/mm/shmem.c +index e30ffaa065a4..54911bbc74d6 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -283,12 +283,14 @@ bool shmem_charge(struct inode *inode, long pages) + if (!shmem_inode_acct_block(inode, pages)) + return false; + ++ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ ++ inode->i_mapping->nrpages += pages; ++ + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); +- inode->i_mapping->nrpages += pages; + + return true; + } +@@ -298,6 +300,8 @@ void shmem_uncharge(struct inode *inode, long pages) + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long flags; + ++ /* nrpages adjustment done by __delete_from_page_cache() or caller */ ++ + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + inode->i_blocks -= pages * BLOCKS_PER_PAGE; +-- +2.17.1 + diff --git a/queue-4.9/mm-khugepaged-minor-reorderings-in-collapse_shmem.patch b/queue-4.9/mm-khugepaged-minor-reorderings-in-collapse_shmem.patch new file mode 100644 index 00000000000..e68389c8b8c --- /dev/null +++ b/queue-4.9/mm-khugepaged-minor-reorderings-in-collapse_shmem.patch @@ -0,0 +1,234 @@ +From b32ac19d2e6cfa7bb93ac3b9ef6172a0399e2c1d Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Fri, 30 Nov 2018 14:10:39 -0800 +Subject: mm/khugepaged: minor reorderings in collapse_shmem() + +commit 042a30824871fa3149b0127009074b75cc25863c upstream. + +Several cleanups in collapse_shmem(): most of which probably do not +really matter, beyond doing things in a more familiar and reassuring +order. Simplify the failure gotos in the main loop, and on success +update stats while interrupts still disabled from the last iteration. + +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261526400.2275@eggly.anvils +Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") +Signed-off-by: Hugh Dickins +Acked-by: Kirill A. 
Shutemov +Cc: Jerome Glisse +Cc: Konstantin Khlebnikov +Cc: Matthew Wilcox +Cc: [4.8+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 73 ++++++++++++++++++++----------------------------- + 1 file changed, 30 insertions(+), 43 deletions(-) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 2d3ce49f6b45..47b83030fc53 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1332,13 +1332,12 @@ static void collapse_shmem(struct mm_struct *mm, + goto out; + } + ++ __SetPageLocked(new_page); ++ __SetPageSwapBacked(new_page); + new_page->index = start; + new_page->mapping = mapping; +- __SetPageSwapBacked(new_page); +- __SetPageLocked(new_page); + BUG_ON(!page_ref_freeze(new_page, 1)); + +- + /* + * At this point the new_page is 'frozen' (page_count() is zero), locked + * and not up-to-date. It's safe to insert it into radix tree, because +@@ -1367,13 +1366,13 @@ static void collapse_shmem(struct mm_struct *mm, + */ + if (n && !shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; +- break; ++ goto tree_locked; + } +- nr_none += n; + for (; index < min(iter.index, end); index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } ++ nr_none += n; + + /* We are done. */ + if (index >= end) +@@ -1389,12 +1388,12 @@ static void collapse_shmem(struct mm_struct *mm, + result = SCAN_FAIL; + goto tree_unlocked; + } +- spin_lock_irq(&mapping->tree_lock); + } else if (trylock_page(page)) { + get_page(page); ++ spin_unlock_irq(&mapping->tree_lock); + } else { + result = SCAN_PAGE_LOCK; +- break; ++ goto tree_locked; + } + + /* +@@ -1409,11 +1408,10 @@ static void collapse_shmem(struct mm_struct *mm, + result = SCAN_TRUNCATED; + goto out_unlock; + } +- spin_unlock_irq(&mapping->tree_lock); + + if (isolate_lru_page(page)) { + result = SCAN_DEL_PAGE_LRU; +- goto out_isolate_failed; ++ goto out_unlock; + } + + if (page_mapped(page)) +@@ -1435,7 +1433,9 @@ static void collapse_shmem(struct mm_struct *mm, + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; +- goto out_lru; ++ spin_unlock_irq(&mapping->tree_lock); ++ putback_lru_page(page); ++ goto out_unlock; + } + + /* +@@ -1451,17 +1451,10 @@ static void collapse_shmem(struct mm_struct *mm, + slot = radix_tree_iter_next(&iter); + index++; + continue; +-out_lru: +- spin_unlock_irq(&mapping->tree_lock); +- putback_lru_page(page); +-out_isolate_failed: +- unlock_page(page); +- put_page(page); +- goto tree_unlocked; + out_unlock: + unlock_page(page); + put_page(page); +- break; ++ goto tree_unlocked; + } + + /* +@@ -1469,7 +1462,7 @@ static void collapse_shmem(struct mm_struct *mm, + * This code only triggers if there's nothing in radix tree + * beyond 'end'. 
+ */ +- if (result == SCAN_SUCCEED && index < end) { ++ if (index < end) { + int n = end - index; + + /* Stop if extent has been truncated, and is now empty */ +@@ -1481,7 +1474,6 @@ static void collapse_shmem(struct mm_struct *mm, + result = SCAN_FAIL; + goto tree_locked; + } +- + for (; index < end; index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); +@@ -1489,14 +1481,19 @@ static void collapse_shmem(struct mm_struct *mm, + nr_none += n; + } + ++ __inc_node_page_state(new_page, NR_SHMEM_THPS); ++ if (nr_none) { ++ struct zone *zone = page_zone(new_page); ++ ++ __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); ++ __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); ++ } ++ + tree_locked: + spin_unlock_irq(&mapping->tree_lock); + tree_unlocked: + + if (result == SCAN_SUCCEED) { +- unsigned long flags; +- struct zone *zone = page_zone(new_page); +- + /* + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. +@@ -1510,11 +1507,11 @@ static void collapse_shmem(struct mm_struct *mm, + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); +- unlock_page(page); +- page_ref_unfreeze(page, 1); + page->mapping = NULL; ++ page_ref_unfreeze(page, 1); + ClearPageActive(page); + ClearPageUnevictable(page); ++ unlock_page(page); + put_page(page); + index++; + } +@@ -1523,28 +1520,17 @@ static void collapse_shmem(struct mm_struct *mm, + index++; + } + +- local_irq_save(flags); +- __inc_node_page_state(new_page, NR_SHMEM_THPS); +- if (nr_none) { +- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); +- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); +- } +- local_irq_restore(flags); +- +- /* +- * Remove pte page tables, so we can re-faulti +- * the page as huge. +- */ +- retract_page_tables(mapping, start); +- + /* Everything is ready, let's unfreeze the new_page */ +- set_page_dirty(new_page); + SetPageUptodate(new_page); + page_ref_unfreeze(new_page, HPAGE_PMD_NR); ++ set_page_dirty(new_page); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); +- unlock_page(new_page); + ++ /* ++ * Remove pte page tables, so we can re-fault the page as huge. 
++ */ ++ retract_page_tables(mapping, start); + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ +@@ -1576,8 +1562,8 @@ static void collapse_shmem(struct mm_struct *mm, + page_ref_unfreeze(page, 2); + radix_tree_replace_slot(slot, page); + spin_unlock_irq(&mapping->tree_lock); +- putback_lru_page(page); + unlock_page(page); ++ putback_lru_page(page); + spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_iter_next(&iter); + } +@@ -1587,9 +1573,10 @@ static void collapse_shmem(struct mm_struct *mm, + /* Unfreeze new_page, caller would take care about freeing it */ + page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); +- unlock_page(new_page); + new_page->mapping = NULL; + } ++ ++ unlock_page(new_page); + out: + VM_BUG_ON(!list_empty(&pagelist)); + /* TODO: tracepoints */ +-- +2.17.1 + diff --git a/queue-4.9/series b/queue-4.9/series new file mode 100644 index 00000000000..d25bcd1edc8 --- /dev/null +++ b/queue-4.9/series @@ -0,0 +1,12 @@ +mm-huge_memory-rename-freeze_page-to-unmap_page.patch +mm-huge_memory.c-reorder-operations-in-__split_huge_.patch +mm-huge_memory-splitting-set-mapping-index-before-un.patch +mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_siz.patch +mm-khugepaged-collapse_shmem-stop-if-punched-or-trun.patch +shmem-shmem_charge-verify-max_block-is-not-exceeded-.patch +shmem-introduce-shmem_inode_acct_block.patch +mm-khugepaged-fix-crashes-due-to-misaccounted-holes.patch +mm-khugepaged-collapse_shmem-remember-to-clear-holes.patch +mm-khugepaged-minor-reorderings-in-collapse_shmem.patch +mm-khugepaged-collapse_shmem-without-freezing-new_pa.patch +mm-khugepaged-collapse_shmem-do-not-crash-on-compoun.patch diff --git a/queue-4.9/shmem-introduce-shmem_inode_acct_block.patch b/queue-4.9/shmem-introduce-shmem_inode_acct_block.patch new file mode 100644 index 00000000000..36c14443cef --- /dev/null +++ b/queue-4.9/shmem-introduce-shmem_inode_acct_block.patch @@ -0,0 +1,198 @@ +From d29cd29cf934dea66d08c49f7a055abfa7d27257 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport +Date: Wed, 6 Sep 2017 16:22:59 -0700 +Subject: shmem: introduce shmem_inode_acct_block + +commit 0f0796945614b7523987f7eea32407421af4b1ee upstream. + +The shmem_acct_block and the update of used_blocks are following one +another in all the places they are used. Combine these two into a +helper function. + +Link: http://lkml.kernel.org/r/1497939652-16528-3-git-send-email-rppt@linux.vnet.ibm.com +Signed-off-by: Mike Rapoport +Cc: "Kirill A. 
Shutemov" +Cc: Andrea Arcangeli +Cc: Hillf Danton +Cc: Hugh Dickins +Cc: Pavel Emelyanov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/shmem.c | 82 ++++++++++++++++++++++++++++-------------------------- + 1 file changed, 42 insertions(+), 40 deletions(-) + +diff --git a/mm/shmem.c b/mm/shmem.c +index b26f11221ea8..e30ffaa065a4 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -181,6 +181,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) + vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); + } + ++static inline bool shmem_inode_acct_block(struct inode *inode, long pages) ++{ ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); ++ ++ if (shmem_acct_block(info->flags, pages)) ++ return false; ++ ++ if (sbinfo->max_blocks) { ++ if (percpu_counter_compare(&sbinfo->used_blocks, ++ sbinfo->max_blocks - pages) > 0) ++ goto unacct; ++ percpu_counter_add(&sbinfo->used_blocks, pages); ++ } ++ ++ return true; ++ ++unacct: ++ shmem_unacct_blocks(info->flags, pages); ++ return false; ++} ++ ++static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) ++{ ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); ++ ++ if (sbinfo->max_blocks) ++ percpu_counter_sub(&sbinfo->used_blocks, pages); ++ shmem_unacct_blocks(info->flags, pages); ++} ++ + static const struct super_operations shmem_ops; + static const struct address_space_operations shmem_aops; + static const struct file_operations shmem_file_operations; +@@ -237,31 +269,20 @@ static void shmem_recalc_inode(struct inode *inode) + + freed = info->alloced - info->swapped - inode->i_mapping->nrpages; + if (freed > 0) { +- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); +- if (sbinfo->max_blocks) +- percpu_counter_add(&sbinfo->used_blocks, -freed); + info->alloced -= freed; + inode->i_blocks -= freed * BLOCKS_PER_PAGE; +- shmem_unacct_blocks(info->flags, freed); ++ shmem_inode_unacct_blocks(inode, freed); + } + } + + bool shmem_charge(struct inode *inode, long pages) + { + struct shmem_inode_info *info = SHMEM_I(inode); +- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + +- if (shmem_acct_block(info->flags, pages)) ++ if (!shmem_inode_acct_block(inode, pages)) + return false; + +- if (sbinfo->max_blocks) { +- if (percpu_counter_compare(&sbinfo->used_blocks, +- sbinfo->max_blocks - pages) > 0) +- goto unacct; +- percpu_counter_add(&sbinfo->used_blocks, pages); +- } +- + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; +@@ -270,16 +291,11 @@ bool shmem_charge(struct inode *inode, long pages) + inode->i_mapping->nrpages += pages; + + return true; +- +-unacct: +- shmem_unacct_blocks(info->flags, pages); +- return false; + } + + void shmem_uncharge(struct inode *inode, long pages) + { + struct shmem_inode_info *info = SHMEM_I(inode); +- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); +@@ -288,9 +304,7 @@ void shmem_uncharge(struct inode *inode, long pages) + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + +- if (sbinfo->max_blocks) +- percpu_counter_sub(&sbinfo->used_blocks, pages); +- shmem_unacct_blocks(info->flags, pages); ++ shmem_inode_unacct_blocks(inode, pages); + } + + /* +@@ -1423,9 +1437,10 @@ static struct page *shmem_alloc_page(gfp_t gfp, + } + + static struct 
page *shmem_alloc_and_acct_page(gfp_t gfp, +- struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, ++ struct inode *inode, + pgoff_t index, bool huge) + { ++ struct shmem_inode_info *info = SHMEM_I(inode); + struct page *page; + int nr; + int err = -ENOSPC; +@@ -1434,14 +1449,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, + huge = false; + nr = huge ? HPAGE_PMD_NR : 1; + +- if (shmem_acct_block(info->flags, nr)) ++ if (!shmem_inode_acct_block(inode, nr)) + goto failed; +- if (sbinfo->max_blocks) { +- if (percpu_counter_compare(&sbinfo->used_blocks, +- sbinfo->max_blocks - nr) > 0) +- goto unacct; +- percpu_counter_add(&sbinfo->used_blocks, nr); +- } + + if (huge) + page = shmem_alloc_hugepage(gfp, info, index); +@@ -1454,10 +1463,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, + } + + err = -ENOMEM; +- if (sbinfo->max_blocks) +- percpu_counter_add(&sbinfo->used_blocks, -nr); +-unacct: +- shmem_unacct_blocks(info->flags, nr); ++ shmem_inode_unacct_blocks(inode, nr); + failed: + return ERR_PTR(err); + } +@@ -1717,10 +1723,9 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + } + + alloc_huge: +- page = shmem_alloc_and_acct_page(gfp, info, sbinfo, +- index, true); ++ page = shmem_alloc_and_acct_page(gfp, inode, index, true); + if (IS_ERR(page)) { +-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, ++alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, + index, false); + } + if (IS_ERR(page)) { +@@ -1842,10 +1847,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, + * Error recovery. + */ + unacct: +- if (sbinfo->max_blocks) +- percpu_counter_sub(&sbinfo->used_blocks, +- 1 << compound_order(page)); +- shmem_unacct_blocks(info->flags, 1 << compound_order(page)); ++ shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); + + if (PageTransHuge(page)) { + unlock_page(page); +-- +2.17.1 + diff --git a/queue-4.9/shmem-shmem_charge-verify-max_block-is-not-exceeded-.patch b/queue-4.9/shmem-shmem_charge-verify-max_block-is-not-exceeded-.patch new file mode 100644 index 00000000000..a50cbeb3118 --- /dev/null +++ b/queue-4.9/shmem-shmem_charge-verify-max_block-is-not-exceeded-.patch @@ -0,0 +1,83 @@ +From e572a0c101848b718dec80b7061ca221ed1e8308 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport +Date: Wed, 6 Sep 2017 16:22:56 -0700 +Subject: shmem: shmem_charge: verify max_block is not exceeded before inode + update + +commit b1cc94ab2f2ba31fcb2c59df0b9cf03f6d720553 upstream. + +Patch series "userfaultfd: enable zeropage support for shmem". + +These patches enable support for UFFDIO_ZEROPAGE for shared memory. + +The first two patches are not strictly related to userfaultfd, they are +just minor refactoring to reduce amount of code duplication. + +This patch (of 7): + +Currently we update inode and shmem_inode_info before verifying that +used_blocks will not exceed max_blocks. In case it will, we undo the +update. Let's switch the order and move the verification of the blocks +count before the inode and shmem_inode_info update. + +Link: http://lkml.kernel.org/r/1497939652-16528-2-git-send-email-rppt@linux.vnet.ibm.com +Signed-off-by: Mike Rapoport +Cc: Andrea Arcangeli +Cc: Hugh Dickins +Cc: "Kirill A. 
Shutemov" +Cc: Hillf Danton +Cc: Pavel Emelyanov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/shmem.c | 25 ++++++++++++------------- + 1 file changed, 12 insertions(+), 13 deletions(-) + +diff --git a/mm/shmem.c b/mm/shmem.c +index 358a92be43eb..b26f11221ea8 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -254,6 +254,14 @@ bool shmem_charge(struct inode *inode, long pages) + + if (shmem_acct_block(info->flags, pages)) + return false; ++ ++ if (sbinfo->max_blocks) { ++ if (percpu_counter_compare(&sbinfo->used_blocks, ++ sbinfo->max_blocks - pages) > 0) ++ goto unacct; ++ percpu_counter_add(&sbinfo->used_blocks, pages); ++ } ++ + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; +@@ -261,20 +269,11 @@ bool shmem_charge(struct inode *inode, long pages) + spin_unlock_irqrestore(&info->lock, flags); + inode->i_mapping->nrpages += pages; + +- if (!sbinfo->max_blocks) +- return true; +- if (percpu_counter_compare(&sbinfo->used_blocks, +- sbinfo->max_blocks - pages) > 0) { +- inode->i_mapping->nrpages -= pages; +- spin_lock_irqsave(&info->lock, flags); +- info->alloced -= pages; +- shmem_recalc_inode(inode); +- spin_unlock_irqrestore(&info->lock, flags); +- shmem_unacct_blocks(info->flags, pages); +- return false; +- } +- percpu_counter_add(&sbinfo->used_blocks, pages); + return true; ++ ++unacct: ++ shmem_unacct_blocks(info->flags, pages); ++ return false; + } + + void shmem_uncharge(struct inode *inode, long pages) +-- +2.17.1 +