From 7cc4d518216ca0673fbceb9dfee3125a218da5cb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Nov 2020 17:26:16 +0100 Subject: [PATCH] 5.9-stable patches added patches: bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch compiler.h-fix-barrier_data-on-clang.patch futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch hugetlbfs-fix-anon-huge-page-migration-race.patch jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch mei-protect-mei_cl_mtu-from-null-dereference.patch mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch mm-slub-fix-panic-in-slab_alloc_node.patch mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch ocfs2-initialize-ip_next_orphan.patch reboot-fix-overflow-parsing-reboot-cpu-number.patch revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch --- ...check-range-to-the-preceding-3-bytes.patch | 55 +++ ...compiler.h-fix-barrier_data-on-clang.patch | 131 +++++++ ...irqs-unconditionally-in-put_pi_state.patch | 49 +++ ...fs-fix-anon-huge-page-migration-race.patch | 319 ++++++++++++++++++ ...p-sparse-warnings-in-checkpoint-code.patch | 50 +++ ...ect-mei_cl_mtu-from-null-dereference.patch | 41 +++ ...stop-correctly-during-page-isolation.patch | 86 +++++ ...solated-and-we-have-pages-to-migrate.patch | 53 +++ ..._user_pages-in-__gup_longterm_locked.patch | 66 ++++ ...mm-slub-fix-panic-in-slab_alloc_node.patch | 126 +++++++ ...r_isolated_file-corruption-on-64-bit.patch | 59 ++++ .../ocfs2-initialize-ip_next_orphan.patch | 93 +++++ ...x-overflow-parsing-reboot-cpu-number.patch | 74 ++++ ...-convert-simple_strtoul-to-kstrtoint.patch | 86 +++++ queue-5.9/series | 16 + ...a-memory-allocation-for-rproc-serial.patch | 83 +++++ ...-refercence-leak-in-xhci_histb_probe.patch | 40 +++ 17 files changed, 1427 insertions(+) create mode 100644 queue-5.9/bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch create mode 100644 queue-5.9/compiler.h-fix-barrier_data-on-clang.patch create mode 100644 queue-5.9/futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch create mode 100644 queue-5.9/hugetlbfs-fix-anon-huge-page-migration-race.patch create mode 100644 queue-5.9/jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch create mode 100644 queue-5.9/mei-protect-mei_cl_mtu-from-null-dereference.patch create mode 100644 queue-5.9/mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch create mode 100644 queue-5.9/mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch create mode 100644 queue-5.9/mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch create mode 100644 queue-5.9/mm-slub-fix-panic-in-slab_alloc_node.patch create mode 100644 queue-5.9/mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch create mode 100644 queue-5.9/ocfs2-initialize-ip_next_orphan.patch create mode 100644 queue-5.9/reboot-fix-overflow-parsing-reboot-cpu-number.patch create mode 100644 queue-5.9/revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch create mode 100644 queue-5.9/virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch create mode 100644 queue-5.9/xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch diff --git 
a/queue-5.9/bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch b/queue-5.9/bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch new file mode 100644 index 00000000000..246c25f7225 --- /dev/null +++ b/queue-5.9/bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch @@ -0,0 +1,55 @@ +From 50b8a742850fce7293bed45753152c425f7e931b Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu +Date: Fri, 13 Nov 2020 02:27:31 +0900 +Subject: bootconfig: Extend the magic check range to the preceding 3 bytes + +From: Masami Hiramatsu + +commit 50b8a742850fce7293bed45753152c425f7e931b upstream. + +Since Grub may align the size of initrd to 4 if user pass +initrd from cpio, we have to check the preceding 3 bytes as well. + +Link: https://lkml.kernel.org/r/160520205132.303174.4876760192433315429.stgit@devnote2 + +Cc: stable@vger.kernel.org +Fixes: 85c46b78da58 ("bootconfig: Add bootconfig magic word for indicating bootconfig explicitly") +Reported-by: Chen Yu +Tested-by: Chen Yu +Signed-off-by: Masami Hiramatsu +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman + +--- + init/main.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/init/main.c ++++ b/init/main.c +@@ -267,14 +267,24 @@ static void * __init get_boot_config_fro + u32 size, csum; + char *data; + u32 *hdr; ++ int i; + + if (!initrd_end) + return NULL; + + data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; +- if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) +- return NULL; ++ /* ++ * Since Grub may align the size of initrd to 4, we must ++ * check the preceding 3 bytes as well. ++ */ ++ for (i = 0; i < 4; i++) { ++ if (!memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) ++ goto found; ++ data--; ++ } ++ return NULL; + ++found: + hdr = (u32 *)(data - 8); + size = hdr[0]; + csum = hdr[1]; diff --git a/queue-5.9/compiler.h-fix-barrier_data-on-clang.patch b/queue-5.9/compiler.h-fix-barrier_data-on-clang.patch new file mode 100644 index 00000000000..1e477279fa4 --- /dev/null +++ b/queue-5.9/compiler.h-fix-barrier_data-on-clang.patch @@ -0,0 +1,131 @@ +From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 +From: Arvind Sankar +Date: Fri, 13 Nov 2020 22:51:59 -0800 +Subject: compiler.h: fix barrier_data() on clang + +From: Arvind Sankar + +commit 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 upstream. + +Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive") neglected to copy barrier_data() from +compiler-gcc.h into compiler-clang.h. + +The definition in compiler-gcc.h was really to work around clang's more +aggressive optimization, so this broke barrier_data() on clang, and +consequently memzero_explicit() as well. + +For example, this results in at least the memzero_explicit() call in +lib/crypto/sha256.c:sha256_transform() being optimized away by clang. + +Fix this by moving the definition of barrier_data() into compiler.h. + +Also move the gcc/clang definition of barrier() into compiler.h, +__memory_barrier() is icc-specific (and barrier() is already defined +using it in compiler-intel.h) and doesn't belong in compiler.h. 
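+
+As a purely illustrative sketch (not part of this change; the wipe()
+helper below is hypothetical, though it has the same shape as
+memzero_explicit()), the pattern that regressed is "zero a buffer the
+compiler believes is dead afterwards":
+
+	static inline void wipe(void *buf, size_t len)
+	{
+		memset(buf, 0, len);
+		/*
+		 * The "r"(buf) input in barrier_data() tells the
+		 * compiler that the asm may read the buffer, so clang
+		 * must keep the memset() above instead of eliminating
+		 * it as a dead store; a plain barrier() is not enough.
+		 */
+		barrier_data(buf);
+	}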
+ +[rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] + +Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org +Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") +Signed-off-by: Arvind Sankar +Signed-off-by: Randy Dunlap +Signed-off-by: Andrew Morton +Tested-by: Nick Desaulniers +Reviewed-by: Nick Desaulniers +Reviewed-by: Kees Cook +Cc: +Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/asm-generic/barrier.h | 1 + + include/linux/compiler-clang.h | 6 ------ + include/linux/compiler-gcc.h | 19 ------------------- + include/linux/compiler.h | 18 ++++++++++++++++-- + 4 files changed, 17 insertions(+), 27 deletions(-) + +--- a/include/asm-generic/barrier.h ++++ b/include/asm-generic/barrier.h +@@ -13,6 +13,7 @@ + + #ifndef __ASSEMBLY__ + ++#include + #include + + #ifndef nop +--- a/include/linux/compiler-clang.h ++++ b/include/linux/compiler-clang.h +@@ -52,12 +52,6 @@ + #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 + #endif + +-/* The following are for compatibility with GCC, from compiler-gcc.h, +- * and may be redefined here because they should not be shared with other +- * compilers, like ICC. +- */ +-#define barrier() __asm__ __volatile__("" : : : "memory") +- + #if __has_feature(shadow_call_stack) + # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) + #endif +--- a/include/linux/compiler-gcc.h ++++ b/include/linux/compiler-gcc.h +@@ -15,25 +15,6 @@ + # error Sorry, your compiler is too old - please upgrade it. + #endif + +-/* Optimization barrier */ +- +-/* The "volatile" is due to gcc bugs */ +-#define barrier() __asm__ __volatile__("": : :"memory") +-/* +- * This version is i.e. to prevent dead stores elimination on @ptr +- * where gcc and llvm may behave differently when otherwise using +- * normal barrier(): while gcc behavior gets along with a normal +- * barrier(), llvm needs an explicit input variable to be assumed +- * clobbered. The issue is as follows: while the inline asm might +- * access any memory it wants, the compiler could have fit all of +- * @ptr into memory registers instead, and since @ptr never escaped +- * from that, it proved that the inline asm wasn't touching any of +- * it. This version works well with both compilers, i.e. we're telling +- * the compiler that the inline asm absolutely may see the contents +- * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 +- */ +-#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") +- + /* + * This macro obfuscates arithmetic on a variable address so that gcc + * shouldn't recognize the original var, and make assumptions about it. +--- a/include/linux/compiler.h ++++ b/include/linux/compiler.h +@@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_ + + /* Optimization barrier */ + #ifndef barrier +-# define barrier() __memory_barrier() ++/* The "volatile" is due to gcc bugs */ ++# define barrier() __asm__ __volatile__("": : :"memory") + #endif + + #ifndef barrier_data +-# define barrier_data(ptr) barrier() ++/* ++ * This version is i.e. to prevent dead stores elimination on @ptr ++ * where gcc and llvm may behave differently when otherwise using ++ * normal barrier(): while gcc behavior gets along with a normal ++ * barrier(), llvm needs an explicit input variable to be assumed ++ * clobbered. 
The issue is as follows: while the inline asm might ++ * access any memory it wants, the compiler could have fit all of ++ * @ptr into memory registers instead, and since @ptr never escaped ++ * from that, it proved that the inline asm wasn't touching any of ++ * it. This version works well with both compilers, i.e. we're telling ++ * the compiler that the inline asm absolutely may see the contents ++ * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 ++ */ ++# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") + #endif + + /* workaround for GCC PR82365 if needed */ diff --git a/queue-5.9/futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch b/queue-5.9/futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch new file mode 100644 index 00000000000..20d8798a86e --- /dev/null +++ b/queue-5.9/futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch @@ -0,0 +1,49 @@ +From 1e106aa3509b86738769775969822ffc1ec21bf4 Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Fri, 6 Nov 2020 11:52:05 +0300 +Subject: futex: Don't enable IRQs unconditionally in put_pi_state() + +From: Dan Carpenter + +commit 1e106aa3509b86738769775969822ffc1ec21bf4 upstream. + +The exit_pi_state_list() function calls put_pi_state() with IRQs disabled +and is not expecting that IRQs will be enabled inside the function. + +Use the _irqsave() variant so that IRQs are restored to the original state +instead of being enabled unconditionally. + +Fixes: 153fbd1226fb ("futex: Fix more put_pi_state() vs. exit_pi_state_list() races") +Signed-off-by: Dan Carpenter +Signed-off-by: Thomas Gleixner +Acked-by: Peter Zijlstra (Intel) +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20201106085205.GA1159983@mwanda +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/futex.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi + */ + if (pi_state->owner) { + struct task_struct *owner; ++ unsigned long flags; + +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); +@@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); + } + + if (current->pi_state_cache) { diff --git a/queue-5.9/hugetlbfs-fix-anon-huge-page-migration-race.patch b/queue-5.9/hugetlbfs-fix-anon-huge-page-migration-race.patch new file mode 100644 index 00000000000..b5fa07b04bc --- /dev/null +++ b/queue-5.9/hugetlbfs-fix-anon-huge-page-migration-race.patch @@ -0,0 +1,319 @@ +From 336bf30eb76580b579dc711ded5d599d905c0217 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Fri, 13 Nov 2020 22:52:16 -0800 +Subject: hugetlbfs: fix anon huge page migration race + +From: Mike Kravetz + +commit 336bf30eb76580b579dc711ded5d599d905c0217 upstream. + +Qian Cai reported the following BUG in [1] + + LTP: starting move_pages12 + BUG: unable to handle page fault for address: ffffffffffffffe0 + ... 
+ RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 avc_start_pgoff at mm/interval_tree.c:63 + Call Trace: + rmap_walk_anon+0x141/0xa30 rmap_walk_anon at mm/rmap.c:1864 + try_to_unmap+0x209/0x2d0 try_to_unmap at mm/rmap.c:1763 + migrate_pages+0x1005/0x1fb0 + move_pages_and_store_status.isra.47+0xd7/0x1a0 + __x64_sys_move_pages+0xa5c/0x1100 + do_syscall_64+0x5f/0x310 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Hugh Dickins diagnosed this as a migration bug caused by code introduced +to use i_mmap_rwsem for pmd sharing synchronization. Specifically, the +routine unmap_and_move_huge_page() is always passing the TTU_RMAP_LOCKED +flag to try_to_unmap() while holding i_mmap_rwsem. This is wrong for +anon pages as the anon_vma_lock should be held in this case. Further +analysis suggested that i_mmap_rwsem was not required to be held at all +when calling try_to_unmap for anon pages as an anon page could never be +part of a shared pmd mapping. + +Discussion also revealed that the hack in hugetlb_page_mapping_lock_write +to drop page lock and acquire i_mmap_rwsem is wrong. There is no way to +keep mapping valid while dropping page lock. + +This patch does the following: + + - Do not take i_mmap_rwsem and set TTU_RMAP_LOCKED for anon pages when + calling try_to_unmap. + + - Remove the hacky code in hugetlb_page_mapping_lock_write. The routine + will now simply do a 'trylock' while still holding the page lock. If + the trylock fails, it will return NULL. This could impact the + callers: + + - migration calling code will receive -EAGAIN and retry up to the + hard-coded limit (10). + + - memory error code will treat the page as BUSY. This will force + killing (SIGKILL) of any mapping tasks instead of SIGBUS. + + Do note that this change in behavior only happens when there is a + race. None of the standard kernel testing suites actually hit this + race, but it is possible. + +[1] https://lore.kernel.org/lkml/20200708012044.GC992@lca.pw/ +[2] https://lore.kernel.org/linux-mm/alpine.LSU.2.11.2010071833100.2214@eggly.anvils/ + +Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") +Reported-by: Qian Cai +Suggested-by: Hugh Dickins +Signed-off-by: Mike Kravetz +Signed-off-by: Andrew Morton +Acked-by: Naoya Horiguchi +Cc: +Link: https://lkml.kernel.org/r/20201105195058.78401-1-mike.kravetz@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 90 ++-------------------------------------------------- + mm/memory-failure.c | 36 +++++++++----------- + mm/migrate.c | 46 ++++++++++++++------------ + mm/rmap.c | 5 -- + 4 files changed, 48 insertions(+), 129 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1579,103 +1579,23 @@ int PageHeadHuge(struct page *page_head) + } + + /* +- * Find address_space associated with hugetlbfs page. +- * Upon entry page is locked and page 'was' mapped although mapped state +- * could change. If necessary, use anon_vma to find vma and associated +- * address space. The returned mapping may be stale, but it can not be +- * invalid as page lock (which is held) is required to destroy mapping.
+- */ +-static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) +-{ +- struct anon_vma *anon_vma; +- pgoff_t pgoff_start, pgoff_end; +- struct anon_vma_chain *avc; +- struct address_space *mapping = page_mapping(hpage); +- +- /* Simple file based mapping */ +- if (mapping) +- return mapping; +- +- /* +- * Even anonymous hugetlbfs mappings are associated with an +- * underlying hugetlbfs file (see hugetlb_file_setup in mmap +- * code). Find a vma associated with the anonymous vma, and +- * use the file pointer to get address_space. +- */ +- anon_vma = page_lock_anon_vma_read(hpage); +- if (!anon_vma) +- return mapping; /* NULL */ +- +- /* Use first found vma */ +- pgoff_start = page_to_pgoff(hpage); +- pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; +- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, +- pgoff_start, pgoff_end) { +- struct vm_area_struct *vma = avc->vma; +- +- mapping = vma->vm_file->f_mapping; +- break; +- } +- +- anon_vma_unlock_read(anon_vma); +- return mapping; +-} +- +-/* + * Find and lock address space (mapping) in write mode. + * +- * Upon entry, the page is locked which allows us to find the mapping +- * even in the case of an anon page. However, locking order dictates +- * the i_mmap_rwsem be acquired BEFORE the page lock. This is hugetlbfs +- * specific. So, we first try to lock the sema while still holding the +- * page lock. If this works, great! If not, then we need to drop the +- * page lock and then acquire i_mmap_rwsem and reacquire page lock. Of +- * course, need to revalidate state along the way. ++ * Upon entry, the page is locked which means that page_mapping() is ++ * stable. Due to locking order, we can only trylock_write. If we can ++ * not get the lock, simply return NULL to caller. + */ + struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) + { +- struct address_space *mapping, *mapping2; ++ struct address_space *mapping = page_mapping(hpage); + +- mapping = _get_hugetlb_page_mapping(hpage); +-retry: + if (!mapping) + return mapping; + +- /* +- * If no contention, take lock and return +- */ + if (i_mmap_trylock_write(mapping)) + return mapping; + +- /* +- * Must drop page lock and wait on mapping sema. +- * Note: Once page lock is dropped, mapping could become invalid. +- * As a hack, increase map count until we lock page again. +- */ +- atomic_inc(&hpage->_mapcount); +- unlock_page(hpage); +- i_mmap_lock_write(mapping); +- lock_page(hpage); +- atomic_add_negative(-1, &hpage->_mapcount); +- +- /* verify page is still mapped */ +- if (!page_mapped(hpage)) { +- i_mmap_unlock_write(mapping); +- return NULL; +- } +- +- /* +- * Get address space again and verify it is the same one +- * we locked. If not, drop lock and retry. +- */ +- mapping2 = _get_hugetlb_page_mapping(hpage); +- if (mapping2 != mapping) { +- i_mmap_unlock_write(mapping); +- mapping = mapping2; +- goto retry; +- } +- +- return mapping; ++ return NULL; + } + + pgoff_t __basepage_index(struct page *page) +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1031,27 +1031,25 @@ static bool hwpoison_user_mappings(struc + if (!PageHuge(hpage)) { + unmap_success = try_to_unmap(hpage, ttu); + } else { +- /* +- * For hugetlb pages, try_to_unmap could potentially call +- * huge_pmd_unshare. Because of this, take semaphore in +- * write mode here and set TTU_RMAP_LOCKED to indicate we +- * have taken the lock at this higer level. 
+- * +- * Note that the call to hugetlb_page_mapping_lock_write +- * is necessary even if mapping is already set. It handles +- * ugliness of potentially having to drop page lock to obtain +- * i_mmap_rwsem. +- */ +- mapping = hugetlb_page_mapping_lock_write(hpage); +- +- if (mapping) { +- unmap_success = try_to_unmap(hpage, ++ if (!PageAnon(hpage)) { ++ /* ++ * For hugetlb pages in shared mappings, try_to_unmap ++ * could potentially call huge_pmd_unshare. Because of ++ * this, take semaphore in write mode here and set ++ * TTU_RMAP_LOCKED to indicate we have taken the lock ++ * at this higer level. ++ */ ++ mapping = hugetlb_page_mapping_lock_write(hpage); ++ if (mapping) { ++ unmap_success = try_to_unmap(hpage, + ttu|TTU_RMAP_LOCKED); +- i_mmap_unlock_write(mapping); ++ i_mmap_unlock_write(mapping); ++ } else { ++ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); ++ unmap_success = false; ++ } + } else { +- pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n", +- pfn); +- unmap_success = false; ++ unmap_success = try_to_unmap(hpage, ttu); + } + } + if (!unmap_success) +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1333,34 +1333,38 @@ static int unmap_and_move_huge_page(new_ + goto put_anon; + + if (page_mapped(hpage)) { +- /* +- * try_to_unmap could potentially call huge_pmd_unshare. +- * Because of this, take semaphore in write mode here and +- * set TTU_RMAP_LOCKED to let lower levels know we have +- * taken the lock. +- */ +- mapping = hugetlb_page_mapping_lock_write(hpage); +- if (unlikely(!mapping)) +- goto unlock_put_anon; +- +- try_to_unmap(hpage, +- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| +- TTU_RMAP_LOCKED); ++ bool mapping_locked = false; ++ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| ++ TTU_IGNORE_ACCESS; ++ ++ if (!PageAnon(hpage)) { ++ /* ++ * In shared mappings, try_to_unmap could potentially ++ * call huge_pmd_unshare. Because of this, take ++ * semaphore in write mode here and set TTU_RMAP_LOCKED ++ * to let lower levels know we have taken the lock. ++ */ ++ mapping = hugetlb_page_mapping_lock_write(hpage); ++ if (unlikely(!mapping)) ++ goto unlock_put_anon; ++ ++ mapping_locked = true; ++ ttu |= TTU_RMAP_LOCKED; ++ } ++ ++ try_to_unmap(hpage, ttu); + page_was_mapped = 1; +- /* +- * Leave mapping locked until after subsequent call to +- * remove_migration_ptes() +- */ ++ ++ if (mapping_locked) ++ i_mmap_unlock_write(mapping); + } + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, mode); + +- if (page_was_mapped) { ++ if (page_was_mapped) + remove_migration_ptes(hpage, +- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, true); +- i_mmap_unlock_write(mapping); +- } ++ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); + + unlock_put_anon: + unlock_page(new_hpage); +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -1413,9 +1413,6 @@ static bool try_to_unmap_one(struct page + /* + * If sharing is possible, start and end will be adjusted + * accordingly. +- * +- * If called for a huge page, caller must hold i_mmap_rwsem +- * in write mode as it is possible to call huge_pmd_unshare. + */ + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); +@@ -1462,7 +1459,7 @@ static bool try_to_unmap_one(struct page + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; + +- if (PageHuge(page)) { ++ if (PageHuge(page) && !PageAnon(page)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. 
Caller needs to explicitly diff --git a/queue-5.9/jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch b/queue-5.9/jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch new file mode 100644 index 00000000000..9fe5ffec265 --- /dev/null +++ b/queue-5.9/jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch @@ -0,0 +1,50 @@ +From 05d5233df85e9621597c5838e95235107eb624a2 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 7 Nov 2020 00:00:49 -0500 +Subject: jbd2: fix up sparse warnings in checkpoint code + +From: Theodore Ts'o + +commit 05d5233df85e9621597c5838e95235107eb624a2 upstream. + +Add missing __acquires() and __releases() annotations. Also, in an +"this should never happen" WARN_ON check, if it *does* actually +happen, we need to release j_state_lock since this function is always +supposed to release that lock. Otherwise, things will quickly grind +to a halt after the WARN_ON trips. + +Fixes: 96f1e0974575 ("jbd2: avoid long hold times of j_state_lock...") +Cc: stable@kernel.org +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/jbd2/checkpoint.c | 2 ++ + fs/jbd2/transaction.c | 4 +++- + 2 files changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/jbd2/checkpoint.c ++++ b/fs/jbd2/checkpoint.c +@@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct j + * for a checkpoint to free up some space in the log. + */ + void __jbd2_log_wait_for_space(journal_t *journal) ++__acquires(&journal->j_state_lock) ++__releases(&journal->j_state_lock) + { + int nblocks, space_left; + /* assert_spin_locked(&journal->j_state_lock); */ +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -195,8 +195,10 @@ static void wait_transaction_switching(j + DEFINE_WAIT(wait); + + if (WARN_ON(!journal->j_running_transaction || +- journal->j_running_transaction->t_state != T_SWITCH)) ++ journal->j_running_transaction->t_state != T_SWITCH)) { ++ read_unlock(&journal->j_state_lock); + return; ++ } + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); diff --git a/queue-5.9/mei-protect-mei_cl_mtu-from-null-dereference.patch b/queue-5.9/mei-protect-mei_cl_mtu-from-null-dereference.patch new file mode 100644 index 00000000000..c2e6c77259b --- /dev/null +++ b/queue-5.9/mei-protect-mei_cl_mtu-from-null-dereference.patch @@ -0,0 +1,41 @@ +From bcbc0b2e275f0a797de11a10eff495b4571863fc Mon Sep 17 00:00:00 2001 +From: Alexander Usyskin +Date: Thu, 29 Oct 2020 11:54:42 +0200 +Subject: mei: protect mei_cl_mtu from null dereference + +From: Alexander Usyskin + +commit bcbc0b2e275f0a797de11a10eff495b4571863fc upstream. + +A receive callback is queued while the client is still connected +but can still be called after the client was disconnected. Upon +disconnect cl->me_cl is set to NULL, hence we need to check +that ME client is not-NULL in mei_cl_mtu to avoid +null dereference. + +Cc: +Signed-off-by: Alexander Usyskin +Signed-off-by: Tomas Winkler +Link: https://lore.kernel.org/r/20201029095444.957924-2-tomas.winkler@intel.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/misc/mei/client.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/misc/mei/client.h ++++ b/drivers/misc/mei/client.h +@@ -164,11 +164,11 @@ static inline u8 mei_cl_me_id(const stru + * + * @cl: host client + * +- * Return: mtu ++ * Return: mtu or 0 if client is not connected + */ + static inline size_t mei_cl_mtu(const struct mei_cl *cl) + { +- return cl->me_cl->props.max_msg_length; ++ return cl->me_cl ? 
cl->me_cl->props.max_msg_length : 0; + } + + /** diff --git a/queue-5.9/mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch b/queue-5.9/mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch new file mode 100644 index 00000000000..7cf736372cf --- /dev/null +++ b/queue-5.9/mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch @@ -0,0 +1,86 @@ +From 38935861d85a4d9a353d1dd5a156c97700e2765d Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Fri, 13 Nov 2020 22:51:40 -0800 +Subject: mm/compaction: count pages and stop correctly during page isolation + +From: Zi Yan + +commit 38935861d85a4d9a353d1dd5a156c97700e2765d upstream. + +In isolate_migratepages_block, when cc->alloc_contig is true, we are +able to isolate compound pages. But nr_migratepages and nr_isolated did +not count compound pages correctly, causing us to isolate more pages +than we thought. + +So count compound pages as the number of base pages they contain. +Otherwise, we might be trapped in too_many_isolated while loop, since +the actual isolated pages can go up to COMPACT_CLUSTER_MAX*512=16384, +where COMPACT_CLUSTER_MAX is 32, since we stop isolation after +cc->nr_migratepages reaches to COMPACT_CLUSTER_MAX. + +In addition, after we fix the issue above, cc->nr_migratepages could +never be equal to COMPACT_CLUSTER_MAX if compound pages are isolated, +thus page isolation could not stop as we intended. Change the isolation +stop condition to '>='. + +The issue can be triggered as follows: + +In a system with 16GB memory and an 8GB CMA region reserved by +hugetlb_cma, if we first allocate 10GB THPs and mlock them (so some THPs +are allocated in the CMA region and mlocked), reserving 6 1GB hugetlb +pages via /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages will +get stuck (looping in too_many_isolated function) until we kill either +task. With the patch applied, oom will kill the application with 10GB +THPs and let hugetlb page reservation finish. + +[ziy@nvidia.com: v3] + +Link: https://lkml.kernel.org/r/20201030183809.3616803-1-zi.yan@sent.com +Fixes: 1da2f328fa64 ("cmm,thp,compaction,cma: allow THP migration for CMA allocations") +Signed-off-by: Zi Yan +Signed-off-by: Andrew Morton +Reviewed-by: Yang Shi +Acked-by: Vlastimil Babka +Cc: Rik van Riel +Cc: Michal Hocko +Cc: Mel Gorman +Cc: +Link: https://lkml.kernel.org/r/20201029200435.3386066-1-zi.yan@sent.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1013,8 +1013,8 @@ isolate_migratepages_block(struct compac + + isolate_success: + list_add(&page->lru, &cc->migratepages); +- cc->nr_migratepages++; +- nr_isolated++; ++ cc->nr_migratepages += compound_nr(page); ++ nr_isolated += compound_nr(page); + + /* + * Avoid isolating too much unless this block is being +@@ -1022,7 +1022,7 @@ isolate_success: + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. 
+ */ +- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && ++ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && + !cc->rescan && !cc->contended) { + ++low_pfn; + break; +@@ -1133,7 +1133,7 @@ isolate_migratepages_range(struct compac + if (!pfn) + break; + +- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) ++ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) + break; + } + diff --git a/queue-5.9/mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch b/queue-5.9/mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch new file mode 100644 index 00000000000..c2757b4664c --- /dev/null +++ b/queue-5.9/mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch @@ -0,0 +1,53 @@ +From d20bdd571ee5c9966191568527ecdb1bd4b52368 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Fri, 13 Nov 2020 22:51:43 -0800 +Subject: mm/compaction: stop isolation if too many pages are isolated and we have pages to migrate +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Zi Yan + +commit d20bdd571ee5c9966191568527ecdb1bd4b52368 upstream. + +In isolate_migratepages_block, if we have too many isolated pages and +nr_migratepages is not zero, we should try to migrate what we have +without wasting time on isolating. + +In theory it's possible that multiple parallel compactions will cause +too_many_isolated() to become true even if each has isolated less than +COMPACT_CLUSTER_MAX, and loop forever in the while loop. Bailing +immediately prevents that. + +[vbabka@suse.cz: changelog addition] + +Fixes: 1da2f328fa64 (“mm,thp,compaction,cma: allow THP migration for CMA allocations”) +Suggested-by: Vlastimil Babka +Signed-off-by: Zi Yan +Signed-off-by: Andrew Morton +Cc: +Cc: Mel Gorman +Cc: Michal Hocko +Cc: Rik van Riel +Cc: Yang Shi +Link: https://lkml.kernel.org/r/20201030183809.3616803-2-zi.yan@sent.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -818,6 +818,10 @@ isolate_migratepages_block(struct compac + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(pgdat))) { ++ /* stop isolation if there are still pages not migrated */ ++ if (cc->nr_migratepages) ++ return 0; ++ + /* async migration should just abort */ + if (cc->mode == MIGRATE_ASYNC) + return 0; diff --git a/queue-5.9/mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch b/queue-5.9/mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch new file mode 100644 index 00000000000..d1cd8325803 --- /dev/null +++ b/queue-5.9/mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch @@ -0,0 +1,66 @@ +From 96e1fac162cc0086c50b2b14062112adb2ba640e Mon Sep 17 00:00:00 2001 +From: Jason Gunthorpe +Date: Fri, 13 Nov 2020 22:51:56 -0800 +Subject: mm/gup: use unpin_user_pages() in __gup_longterm_locked() + +From: Jason Gunthorpe + +commit 96e1fac162cc0086c50b2b14062112adb2ba640e upstream. + +When FOLL_PIN is passed to __get_user_pages() the page list must be put +back using unpin_user_pages() otherwise the page pin reference persists +in a corrupted state. + +There are two places in the unwind of __gup_longterm_locked() that put +the pages back without checking. 
Normally on error this function would +return the partial page list making this the caller's responsibility, +but in these two cases the caller is not allowed to see these pages at +all. + +Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") +Reported-by: Ira Weiny +Signed-off-by: Jason Gunthorpe +Signed-off-by: Andrew Morton +Reviewed-by: Ira Weiny +Reviewed-by: John Hubbard +Cc: Aneesh Kumar K.V +Cc: Dan Williams +Cc: +Link: https://lkml.kernel.org/r/0-v2-3ae7d9d162e2+2a7-gup_cma_fix_jgg@nvidia.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/gup.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1637,8 +1637,11 @@ check_again: + /* + * drop the above get_user_pages reference. + */ +- for (i = 0; i < nr_pages; i++) +- put_page(pages[i]); ++ if (gup_flags & FOLL_PIN) ++ unpin_user_pages(pages, nr_pages); ++ else ++ for (i = 0; i < nr_pages; i++) ++ put_page(pages[i]); + + if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { +@@ -1718,8 +1721,11 @@ static long __gup_longterm_locked(struct + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { +- for (i = 0; i < rc; i++) +- put_page(pages[i]); ++ if (gup_flags & FOLL_PIN) ++ unpin_user_pages(pages, rc); ++ else ++ for (i = 0; i < rc; i++) ++ put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } diff --git a/queue-5.9/mm-slub-fix-panic-in-slab_alloc_node.patch b/queue-5.9/mm-slub-fix-panic-in-slab_alloc_node.patch new file mode 100644 index 00000000000..acb04c3ebcf --- /dev/null +++ b/queue-5.9/mm-slub-fix-panic-in-slab_alloc_node.patch @@ -0,0 +1,126 @@ +From 22e4663e916321b72972c69ca0c6b962f529bd78 Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Fri, 13 Nov 2020 22:51:53 -0800 +Subject: mm/slub: fix panic in slab_alloc_node() + +From: Laurent Dufour + +commit 22e4663e916321b72972c69ca0c6b962f529bd78 upstream. 
+ +While doing memory hot-unplug operation on a PowerPC VM running 1024 CPUs +with 11TB of ram, I hit the following panic: + + BUG: Kernel NULL pointer dereference on read at 0x00000007 + Faulting instruction address: 0xc000000000456048 + Oops: Kernel access of bad area, sig: 11 [#2] + LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS= 2048 NUMA pSeries + Modules linked in: rpadlpar_io rpaphp + CPU: 160 PID: 1 Comm: systemd Tainted: G D 5.9.0 #1 + NIP: c000000000456048 LR: c000000000455fd4 CTR: c00000000047b350 + REGS: c00006028d1b77a0 TRAP: 0300 Tainted: G D (5.9.0) + MSR: 8000000000009033 CR: 24004228 XER: 00000000 + CFAR: c00000000000f1b0 DAR: 0000000000000007 DSISR: 40000000 IRQMASK: 0 + GPR00: c000000000455fd4 c00006028d1b7a30 c000000001bec800 0000000000000000 + GPR04: 0000000000000dc0 0000000000000000 00000000000374ef c00007c53df99320 + GPR08: 000007c53c980000 0000000000000000 000007c53c980000 0000000000000000 + GPR12: 0000000000004400 c00000001e8e4400 0000000000000000 0000000000000f6a + GPR16: 0000000000000000 c000000001c25930 c000000001d62528 00000000000000c1 + GPR20: c000000001d62538 c00006be469e9000 0000000fffffffe0 c0000000003c0ff8 + GPR24: 0000000000000018 0000000000000000 0000000000000dc0 0000000000000000 + GPR28: c00007c513755700 c000000001c236a4 c00007bc4001f800 0000000000000001 + NIP [c000000000456048] __kmalloc_node+0x108/0x790 + LR [c000000000455fd4] __kmalloc_node+0x94/0x790 + Call Trace: + kvmalloc_node+0x58/0x110 + mem_cgroup_css_online+0x10c/0x270 + online_css+0x48/0xd0 + cgroup_apply_control_enable+0x2c4/0x470 + cgroup_mkdir+0x408/0x5f0 + kernfs_iop_mkdir+0x90/0x100 + vfs_mkdir+0x138/0x250 + do_mkdirat+0x154/0x1c0 + system_call_exception+0xf8/0x200 + system_call_common+0xf0/0x27c + Instruction dump: + e93e0000 e90d0030 39290008 7cc9402a e94d0030 e93e0000 7ce95214 7f89502a + 2fbc0000 419e0018 41920230 e9270010 <89290007> 7f994800 419e0220 7ee6bb78 + +This pointing to the following code: + + mm/slub.c:2851 + if (unlikely(!object || !node_match(page, node))) { + c000000000456038: 00 00 bc 2f cmpdi cr7,r28,0 + c00000000045603c: 18 00 9e 41 beq cr7,c000000000456054 <__kmalloc_node+0x114> + node_match(): + mm/slub.c:2491 + if (node != NUMA_NO_NODE && page_to_nid(page) != node) + c000000000456040: 30 02 92 41 beq cr4,c000000000456270 <__kmalloc_node+0x330> + page_to_nid(): + include/linux/mm.h:1294 + c000000000456044: 10 00 27 e9 ld r9,16(r7) + c000000000456048: 07 00 29 89 lbz r9,7(r9) <<<< r9 = NULL + node_match(): + mm/slub.c:2491 + c00000000045604c: 00 48 99 7f cmpw cr7,r25,r9 + c000000000456050: 20 02 9e 41 beq cr7,c000000000456270 <__kmalloc_node+0x330> + +The panic occurred in slab_alloc_node() when checking for the page's node: + + object = c->freelist; + page = c->page; + if (unlikely(!object || !node_match(page, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c); + stat(s, ALLOC_SLOWPATH); + +The issue is that object is not NULL while page is NULL which is odd but +may happen if the cache flush happened after loading object but before +loading page. Thus checking for the page pointer is required too. + +The cache flush is done through an inter processor interrupt when a +piece of memory is off-lined. That interrupt is triggered when a memory +hot-unplug operation is initiated and offline_pages() is calling the +slub's MEM_GOING_OFFLINE callback slab_mem_going_offline_callback() +which is calling flush_cpu_slab(). If that interrupt is caught between +the reading of c->freelist and the reading of c->page, this could lead +to such a situation. 
That situation is expected and the later call to +this_cpu_cmpxchg_double() will detect the change to c->freelist and redo +the whole operation. + +In commit 6159d0f5c03e ("mm/slub.c: page is always non-NULL in +node_match()") the check on the page pointer was removed, assuming that +page is always valid when it is called. It happens that this is not +true in that particular case, so check for page before calling +node_match() here. + +Fixes: 6159d0f5c03e ("mm/slub.c: page is always non-NULL in node_match()") +Signed-off-by: Laurent Dufour +Signed-off-by: Andrew Morton +Acked-by: Vlastimil Babka +Acked-by: Christoph Lameter +Cc: Wei Yang +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Cc: Nathan Lynch +Cc: Scott Cheloha +Cc: Michal Hocko +Cc: +Link: https://lkml.kernel.org/r/20201027190406.33283-1-ldufour@linux.ibm.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/slub.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2848,7 +2848,7 @@ redo: + + object = c->freelist; + page = c->page; +- if (unlikely(!object || !node_match(page, node))) { ++ if (unlikely(!object || !page || !node_match(page, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c); + stat(s, ALLOC_SLOWPATH); + } else { diff --git a/queue-5.9/mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch b/queue-5.9/mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch new file mode 100644 index 00000000000..5400b774676 --- /dev/null +++ b/queue-5.9/mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch @@ -0,0 +1,59 @@ +From 2da9f6305f306ffbbb44790675799328fb73119d Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Fri, 13 Nov 2020 22:51:46 -0800 +Subject: mm/vmscan: fix NR_ISOLATED_FILE corruption on 64-bit + +From: Nicholas Piggin + +commit 2da9f6305f306ffbbb44790675799328fb73119d upstream. + +Previously the negated unsigned long would be cast back to signed long +which would have the correct negative value. After commit 730ec8c01a2b +("mm/vmscan.c: change prototype for shrink_page_list"), the large +unsigned int converts to a large positive signed long. + +Symptoms include CMA allocations hanging forever holding the cma_mutex +due to alloc_contig_range->...->isolate_migratepages_block waiting +forever in "while (unlikely(too_many_isolated(pgdat)))".
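+
+For illustration only (a minimal sketch, not part of the upstream
+patch): on a 64-bit kernel, negating the unsigned int first wraps
+within 32 bits and then zero-extends, while casting to long before
+negating preserves the intended negative delta:
+
+	unsigned int nr_reclaimed = 5;
+	long delta;
+
+	delta = -nr_reclaimed;		/* 4294967291 (0xfffffffb), not -5 */
+	delta = -(long)nr_reclaimed;	/* -5, as the counter update expects */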
+ [akpm@linux-foundation.org: fix -stat.nr_lazyfree_fail as well, per Michal] + +Fixes: 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list") +Signed-off-by: Nicholas Piggin +Signed-off-by: Andrew Morton +Acked-by: Michal Hocko +Cc: Vaneet Narang +Cc: Maninder Singh +Cc: Amit Sahrawat +Cc: Mel Gorman +Cc: Vlastimil Babka +Cc: +Link: https://lkml.kernel.org/r/20201029032320.1448441-1-npiggin@gmail.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1514,7 +1514,8 @@ unsigned int reclaim_clean_pages_from_li + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); + list_splice(&clean_pages, page_list); +- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); ++ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, ++ -(long)nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to +@@ -1524,7 +1525,7 @@ unsigned int reclaim_clean_pages_from_li + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, +- -stat.nr_lazyfree_fail); ++ -(long)stat.nr_lazyfree_fail); + return nr_reclaimed; + } + diff --git a/queue-5.9/ocfs2-initialize-ip_next_orphan.patch b/queue-5.9/ocfs2-initialize-ip_next_orphan.patch new file mode 100644 index 00000000000..fa20adb81cb --- /dev/null +++ b/queue-5.9/ocfs2-initialize-ip_next_orphan.patch @@ -0,0 +1,93 @@ +From f5785283dd64867a711ca1fb1f5bb172f252ecdf Mon Sep 17 00:00:00 2001 +From: Wengang Wang +Date: Fri, 13 Nov 2020 22:52:23 -0800 +Subject: ocfs2: initialize ip_next_orphan + +From: Wengang Wang + +commit f5785283dd64867a711ca1fb1f5bb172f252ecdf upstream. + +Though the problem was found on an older 4.1.12 kernel, I think upstream has +the same issue. + +On one node in the cluster, there is the following callback trace: + + # cat /proc/21473/stack + __ocfs2_cluster_lock.isra.36+0x336/0x9e0 [ocfs2] + ocfs2_inode_lock_full_nested+0x121/0x520 [ocfs2] + ocfs2_evict_inode+0x152/0x820 [ocfs2] + evict+0xae/0x1a0 + iput+0x1c6/0x230 + ocfs2_orphan_filldir+0x5d/0x100 [ocfs2] + ocfs2_dir_foreach_blk+0x490/0x4f0 [ocfs2] + ocfs2_dir_foreach+0x29/0x30 [ocfs2] + ocfs2_recover_orphans+0x1b6/0x9a0 [ocfs2] + ocfs2_complete_recovery+0x1de/0x5c0 [ocfs2] + process_one_work+0x169/0x4a0 + worker_thread+0x5b/0x560 + kthread+0xcb/0xf0 + ret_from_fork+0x61/0x90 + +The above stack is not reasonable; the final iput shouldn't happen in +the ocfs2_orphan_filldir() function. Looking at the code, + + 2067 /* Skip inodes which are already added to recover list, since dio may + 2068 * happen concurrently with unlink/rename */ + 2069 if (OCFS2_I(iter)->ip_next_orphan) { + 2070 iput(iter); + 2071 return 0; + 2072 } + 2073 + +The logic thinks the inode is already in the recover list on seeing +ip_next_orphan is non-NULL, so it skips this inode after dropping a +reference which was incremented in ocfs2_iget(). + +However, if the inode is already in the recover list, it should have another +reference and the iput() at line 2070 should not be the final iput +(dropping the last reference). So I don't think the inode is really in +the recover list (no vmcore to confirm).
+ +Note that ocfs2_queue_orphans(), though it does not show up in the call +trace, is holding the cluster lock on the orphan directory when looking up +unlinked inodes. The on-disk inode eviction could involve a lot of +I/O, which may take a long time to finish. That means this node could hold +the cluster lock for a very long time, which can cause lock requests +(from other nodes) to the orphan directory to hang for a long time. + +Looking more closely at ip_next_orphan, I found it is not initialized when +allocating a new ocfs2_inode_info structure. + +This causes the reflink operations from some nodes to hang for a very long +time waiting for the cluster lock on the orphan directory. + +Fix: initialize ip_next_orphan as NULL. + +Signed-off-by: Wengang Wang +Signed-off-by: Andrew Morton +Reviewed-by: Joseph Qi +Cc: Mark Fasheh +Cc: Joel Becker +Cc: Junxiao Bi +Cc: Changwei Ge +Cc: Gang He +Cc: Jun Piao +Cc: +Link: https://lkml.kernel.org/r/20201109171746.27884-1-wen.gang.wang@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/super.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/ocfs2/super.c ++++ b/fs/ocfs2/super.c +@@ -1713,6 +1713,7 @@ static void ocfs2_inode_init_once(void * + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; ++ oi->ip_next_orphan = NULL; + + ocfs2_resv_init_once(&oi->ip_la_data_resv); + diff --git a/queue-5.9/reboot-fix-overflow-parsing-reboot-cpu-number.patch b/queue-5.9/reboot-fix-overflow-parsing-reboot-cpu-number.patch new file mode 100644 index 00000000000..efade5136a7 --- /dev/null +++ b/queue-5.9/reboot-fix-overflow-parsing-reboot-cpu-number.patch @@ -0,0 +1,74 @@ +From df5b0ab3e08a156701b537809914b339b0daa526 Mon Sep 17 00:00:00 2001 +From: Matteo Croce +Date: Fri, 13 Nov 2020 22:52:07 -0800 +Subject: reboot: fix overflow parsing reboot cpu number + +From: Matteo Croce + +commit df5b0ab3e08a156701b537809914b339b0daa526 upstream.
+ +Limit the CPU number to num_possible_cpus(), because setting it to a +value lower than INT_MAX but higher than NR_CPUS produces the following +error on reboot and shutdown: + + BUG: unable to handle page fault for address: ffffffff90ab1bb0 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 1c09067 P4D 1c09067 PUD 1c0a063 PMD 0 + Oops: 0000 [#1] SMP + CPU: 1 PID: 1 Comm: systemd-shutdow Not tainted 5.9.0-rc8-kvm #110 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + RIP: 0010:migrate_to_reboot_cpu+0xe/0x60 + Code: ea ea 00 48 89 fa 48 c7 c7 30 57 f1 81 e9 fa ef ff ff 66 2e 0f 1f 84 00 00 00 00 00 53 8b 1d d5 ea ea 00 e8 14 33 fe ff 89 da <48> 0f a3 15 ea fc bd 00 48 89 d0 73 29 89 c2 c1 e8 06 65 48 8b 3c + RSP: 0018:ffffc90000013e08 EFLAGS: 00010246 + RAX: ffff88801f0a0000 RBX: 0000000077359400 RCX: 0000000000000000 + RDX: 0000000077359400 RSI: 0000000000000002 RDI: ffffffff81c199e0 + RBP: ffffffff81c1e3c0 R08: ffff88801f41f000 R09: ffffffff81c1e348 + R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 + R13: 00007f32bedf8830 R14: 00000000fee1dead R15: 0000000000000000 + FS: 00007f32bedf8980(0000) GS:ffff88801f480000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: ffffffff90ab1bb0 CR3: 000000001d057000 CR4: 00000000000006a0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + __do_sys_reboot.cold+0x34/0x5b + do_syscall_64+0x2d/0x40 + +Fixes: 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") +Signed-off-by: Matteo Croce +Signed-off-by: Andrew Morton +Cc: Arnd Bergmann +Cc: Fabian Frederick +Cc: Greg Kroah-Hartman +Cc: Guenter Roeck +Cc: Kees Cook +Cc: Mike Rapoport +Cc: Pavel Tatashin +Cc: Petr Mladek +Cc: Robin Holt +Cc: +Link: https://lkml.kernel.org/r/20201103214025.116799-3-mcroce@linux.microsoft.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/reboot.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/kernel/reboot.c ++++ b/kernel/reboot.c +@@ -558,6 +558,13 @@ static int __init reboot_setup(char *str + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else + *mode = REBOOT_SOFT; ++ if (reboot_cpu >= num_possible_cpus()) { ++ pr_err("Ignoring the CPU number in reboot= option. " ++ "CPU %d exceeds possible cpu number %d\n", ++ reboot_cpu, num_possible_cpus()); ++ reboot_cpu = 0; ++ break; ++ } + break; + + case 'g': diff --git a/queue-5.9/revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch b/queue-5.9/revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch new file mode 100644 index 00000000000..0bb1379abe8 --- /dev/null +++ b/queue-5.9/revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch @@ -0,0 +1,86 @@ +From 8b92c4ff4423aa9900cf838d3294fcade4dbda35 Mon Sep 17 00:00:00 2001 +From: Matteo Croce +Date: Fri, 13 Nov 2020 22:52:02 -0800 +Subject: Revert "kernel/reboot.c: convert simple_strtoul to kstrtoint" + +From: Matteo Croce + +commit 8b92c4ff4423aa9900cf838d3294fcade4dbda35 upstream. + +Patch series "fix parsing of reboot= cmdline", v3. + +The parsing of the reboot= cmdline has two major errors: + + - a missing bound check can crash the system on reboot + + - parsing of the cpu number only works if specified last + +Fix both. + +This patch (of 2): + +This reverts commit 616feab753972b97. 
+ +kstrtoint() and simple_strtoul() have a subtle difference which makes +them non interchangeable: if a non digit character is found amid the +parsing, the former will return an error, while the latter will just +stop parsing, e.g. simple_strtoul("123xyx") = 123. + +The kernel cmdline reboot= argument allows to specify the CPU used for +rebooting, with the syntax `s####` among the other flags, e.g. +"reboot=warm,s31,force", so if this flag is not the last given, it's +silently ignored as well as the subsequent ones. + +Fixes: 616feab75397 ("kernel/reboot.c: convert simple_strtoul to kstrtoint") +Signed-off-by: Matteo Croce +Signed-off-by: Andrew Morton +Cc: Guenter Roeck +Cc: Petr Mladek +Cc: Arnd Bergmann +Cc: Mike Rapoport +Cc: Kees Cook +Cc: Pavel Tatashin +Cc: Robin Holt +Cc: Fabian Frederick +Cc: Greg Kroah-Hartman +Cc: +Link: https://lkml.kernel.org/r/20201103214025.116799-2-mcroce@linux.microsoft.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/reboot.c | 21 +++++++-------------- + 1 file changed, 7 insertions(+), 14 deletions(-) + +--- a/kernel/reboot.c ++++ b/kernel/reboot.c +@@ -551,22 +551,15 @@ static int __init reboot_setup(char *str + break; + + case 's': +- { +- int rc; +- +- if (isdigit(*(str+1))) { +- rc = kstrtoint(str+1, 0, &reboot_cpu); +- if (rc) +- return rc; +- } else if (str[1] == 'm' && str[2] == 'p' && +- isdigit(*(str+3))) { +- rc = kstrtoint(str+3, 0, &reboot_cpu); +- if (rc) +- return rc; +- } else ++ if (isdigit(*(str+1))) ++ reboot_cpu = simple_strtoul(str+1, NULL, 0); ++ else if (str[1] == 'm' && str[2] == 'p' && ++ isdigit(*(str+3))) ++ reboot_cpu = simple_strtoul(str+3, NULL, 0); ++ else + *mode = REBOOT_SOFT; + break; +- } ++ + case 'g': + *mode = REBOOT_GPIO; + break; diff --git a/queue-5.9/series b/queue-5.9/series index d5fbe708955..bac52db0db9 100644 --- a/queue-5.9/series +++ b/queue-5.9/series @@ -200,3 +200,19 @@ uio-fix-use-after-free-in-uio_unregister_device.patch revert-usb-musb-convert-to-devm_platform_ioremap_resource_byname.patch usb-cdc-acm-add-disable_echo-for-renesas-usb-download-mode.patch usb-typec-ucsi-report-power-supply-changes.patch +xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch +virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch +mei-protect-mei_cl_mtu-from-null-dereference.patch +futex-don-t-enable-irqs-unconditionally-in-put_pi_state.patch +jbd2-fix-up-sparse-warnings-in-checkpoint-code.patch +bootconfig-extend-the-magic-check-range-to-the-preceding-3-bytes.patch +mm-compaction-count-pages-and-stop-correctly-during-page-isolation.patch +mm-compaction-stop-isolation-if-too-many-pages-are-isolated-and-we-have-pages-to-migrate.patch +mm-slub-fix-panic-in-slab_alloc_node.patch +mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch +mm-gup-use-unpin_user_pages-in-__gup_longterm_locked.patch +compiler.h-fix-barrier_data-on-clang.patch +revert-kernel-reboot.c-convert-simple_strtoul-to-kstrtoint.patch +reboot-fix-overflow-parsing-reboot-cpu-number.patch +hugetlbfs-fix-anon-huge-page-migration-race.patch +ocfs2-initialize-ip_next_orphan.patch diff --git a/queue-5.9/virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch b/queue-5.9/virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch new file mode 100644 index 00000000000..3b0c7d3434c --- /dev/null +++ b/queue-5.9/virtio-virtio_console-fix-dma-memory-allocation-for-rproc-serial.patch @@ -0,0 +1,83 @@ +From 9d516aa82b7d4fbe7f6303348697960ba03a530b Mon Sep 17 00:00:00 2001 +From: 
Alexander Lobakin +Date: Wed, 4 Nov 2020 15:31:36 +0000 +Subject: virtio: virtio_console: fix DMA memory allocation for rproc serial + +From: Alexander Lobakin + +commit 9d516aa82b7d4fbe7f6303348697960ba03a530b upstream. + +Since commit 086d08725d34 ("remoteproc: create vdev subdevice with +specific dma memory pool"), every remoteproc has a DMA subdevice +("remoteprocX#vdevYbuffer") for each virtio device, which inherits +DMA capabilities from the corresponding platform device. This allowed +to associate different DMA pools with each vdev, and required from +virtio drivers to perform DMA operations with the parent device +(vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent). + +virtio_rpmsg_bus was already changed in the same merge cycle with +commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"), +but virtio_console did not. In fact, operations using the grandparent +worked fine while the grandparent was the platform device, but since +commit c774ad010873 ("remoteproc: Fix and restore the parenting +hierarchy for vdev") this was changed, and now the grandparent device +is the remoteproc device without any DMA capabilities. +So, starting v5.8-rc1 the following warning is observed: + +[ 2.483925] ------------[ cut here ]------------ +[ 2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8 +[ 2.489152] Modules linked in: virtio_console(+) +[ 2.503737] virtio_rpmsg_bus rpmsg_core +[ 2.508903] +[ 2.528898] +[ 2.913043] +[ 2.914907] ---[ end trace 93ac8746beab612c ]--- +[ 2.920102] virtio-ports vport1p0: Error allocating inbufs + +kernel/dma/mapping.c:427 is: + +WARN_ON_ONCE(!dev->coherent_dma_mask); + +obviously because the grandparent now is remoteproc dev without any +DMA caps: + +[ 3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0 + +Fix this the same way as it was for virtio_rpmsg_bus, using just the +parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA +operations. +This also allows now to reserve DMA pools/buffers for rproc serial +via Device Tree. + +Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") +Cc: stable@vger.kernel.org # 5.1+ +Reviewed-by: Mathieu Poirier +Acked-by: Jason Wang +Signed-off-by: Alexander Lobakin +Date: Thu, 5 Nov 2020 11:10:24 +0800 +Link: https://lore.kernel.org/r/AOKowLclCbOCKxyiJ71WeNyuAAj2q8EUtxrXbyky5E@cp7-web-042.plabs.ch +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/char/virtio_console.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/char/virtio_console.c ++++ b/drivers/char/virtio_console.c +@@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(str + /* + * Allocate DMA memory from ancestor. When a virtio + * device is created by remoteproc, the DMA memory is +- * associated with the grandparent device: +- * vdev => rproc => platform-dev. ++ * associated with the parent device: ++ * virtioY => remoteprocX#vdevYbuffer. 
+ */ +- if (!vdev->dev.parent || !vdev->dev.parent->parent) ++ buf->dev = vdev->dev.parent; ++ if (!buf->dev) + goto free_buf; +- buf->dev = vdev->dev.parent->parent; + + /* Increase device refcnt to avoid freeing it */ + get_device(buf->dev); diff --git a/queue-5.9/xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch b/queue-5.9/xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch new file mode 100644 index 00000000000..40f4c78180b --- /dev/null +++ b/queue-5.9/xhci-hisilicon-fix-refercence-leak-in-xhci_histb_probe.patch @@ -0,0 +1,40 @@ +From 76255470ffa2795a44032e8b3c1ced11d81aa2db Mon Sep 17 00:00:00 2001 +From: Zhang Qilong +Date: Fri, 6 Nov 2020 20:22:21 +0800 +Subject: xhci: hisilicon: fix refercence leak in xhci_histb_probe + +From: Zhang Qilong + +commit 76255470ffa2795a44032e8b3c1ced11d81aa2db upstream. + +pm_runtime_get_sync() will increment the pm usage count at first and +will resume the device later. We should decrease the usage count +whether it succeeded or failed (maybe the device's runtime PM has an +error, or the device is in an inaccessible or other error state). +If we do not call a put operation to decrease the reference count, it +will result in a reference leak in xhci_histb_probe. Moreover, the +device cannot enter the idle state and will always stay busy or in some +other non-idle state later. So we fix it by jumping to the error +handling branch. + +Fixes: c508f41da0788 ("xhci: hisilicon: support HiSilicon STB xHCI host controller") +Signed-off-by: Zhang Qilong +Link: https://lore.kernel.org/r/20201106122221.2304528-1-zhangqilong3@huawei.com +Cc: stable +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/usb/host/xhci-histb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/usb/host/xhci-histb.c ++++ b/drivers/usb/host/xhci-histb.c +@@ -240,7 +240,7 @@ static int xhci_histb_probe(struct platf + /* Initialize dma_mask and coherent_dma_mask to 32-bits */ + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + if (ret) +- return ret; ++ goto disable_pm; + + hcd = usb_create_hcd(driver, dev, dev_name(dev)); + if (!hcd) { -- 2.47.3