From: Greg Kroah-Hartman Date: Thu, 10 Dec 2020 12:52:53 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v5.9.14~26 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4bb3a2c92a10eac4267f69f42fa1f055daa89611;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: btrfs-cleanup-cow-block-on-error.patch mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch tracing-fix-userstacktrace-option-for-instances.patch --- diff --git a/queue-4.4/btrfs-cleanup-cow-block-on-error.patch b/queue-4.4/btrfs-cleanup-cow-block-on-error.patch new file mode 100644 index 00000000000..f1e7494829c --- /dev/null +++ b/queue-4.4/btrfs-cleanup-cow-block-on-error.patch @@ -0,0 +1,136 @@ +From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 29 Sep 2020 08:53:54 -0400 +Subject: btrfs: cleanup cow block on error + +From: Josef Bacik + +commit 572c83acdcdafeb04e70aa46be1fa539310be20c upstream. + +In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead +to a system lockup. 
It gets stuck trying to write back inodes, and the +write back thread was trying to lock an extent buffer: + + $ cat /proc/2143497/stack + [<0>] __btrfs_tree_lock+0x108/0x250 + [<0>] lock_extent_buffer_for_io+0x35e/0x3a0 + [<0>] btree_write_cache_pages+0x15a/0x3b0 + [<0>] do_writepages+0x28/0xb0 + [<0>] __writeback_single_inode+0x54/0x5c0 + [<0>] writeback_sb_inodes+0x1e8/0x510 + [<0>] wb_writeback+0xcc/0x440 + [<0>] wb_workfn+0xd7/0x650 + [<0>] process_one_work+0x236/0x560 + [<0>] worker_thread+0x55/0x3c0 + [<0>] kthread+0x13a/0x150 + [<0>] ret_from_fork+0x1f/0x30 + +This is because we got an error while COWing a block, specifically here + + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + [16402.241552] BTRFS: Transaction aborted (error -2) + [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540 + [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8 + [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540 + [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282 + [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388 + [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380 + [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000 + [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0 + [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000 + [16402.256321] FS: 00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000 + [16402.256973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0 + [16402.257867] Call Trace: + [16402.258072] 
btrfs_cow_block+0x109/0x230 + [16402.258356] btrfs_search_slot+0x530/0x9d0 + [16402.258655] btrfs_lookup_file_extent+0x37/0x40 + [16402.259155] __btrfs_drop_extents+0x13c/0xd60 + [16402.259628] ? btrfs_block_rsv_migrate+0x4f/0xb0 + [16402.259949] btrfs_replace_file_extents+0x190/0x820 + [16402.260873] btrfs_clone+0x9ae/0xc00 + [16402.261139] btrfs_extent_same_range+0x66/0x90 + [16402.261771] btrfs_remap_file_range+0x353/0x3b1 + [16402.262333] vfs_dedupe_file_range_one.part.0+0xd5/0x140 + [16402.262821] vfs_dedupe_file_range+0x189/0x220 + [16402.263150] do_vfs_ioctl+0x552/0x700 + [16402.263662] __x64_sys_ioctl+0x62/0xb0 + [16402.264023] do_syscall_64+0x33/0x40 + [16402.264364] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [16402.264862] RIP: 0033:0x7f542c7d15cb + [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb + [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003 + [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64 + [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 + [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80 + [16402.271498] irq event stamp: 0 + [16402.271846] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [16402.272497] hardirqs last disabled at (0): [] copy_process+0x6b9/0x1ba0 + [16402.273343] softirqs last enabled at (0): [] copy_process+0x6b9/0x1ba0 + [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [16402.274338] ---[ end trace 737874a5a41a8236 ]--- + [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry + [16402.276179] BTRFS info (device dm-9): forced readonly + [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry + [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No 
such entry + [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry + [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30 + +The problem here is that as soon as we allocate the new block it is +locked and marked dirty in the btree inode. This means that we could +attempt to writeback this block and need to lock the extent buffer. +However we're not unlocking it here and thus we deadlock. + +Fix this by unlocking the cow block if we have any errors inside of +__btrfs_cow_block, and also free it so we do not leak it. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +[sudip: use old btrfs_abort_transaction()] +Signed-off-by: Sudip Mukherjee +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1129,6 +1129,8 @@ static noinline int __btrfs_cow_block(st + + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + btrfs_abort_transaction(trans, root, ret); + return ret; + } +@@ -1136,6 +1138,8 @@ static noinline int __btrfs_cow_block(st + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + btrfs_abort_transaction(trans, root, ret); + return ret; + } +@@ -1174,6 +1178,8 @@ static noinline int __btrfs_cow_block(st + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + btrfs_abort_transaction(trans, root, ret); + return ret; + } diff --git a/queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch b/queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch new file mode 100644 index 
00000000000..6865858fb2b --- /dev/null +++ b/queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch @@ -0,0 +1,165 @@ +From bfe8cc1db02ab243c62780f17fc57f65bde0afe1 Mon Sep 17 00:00:00 2001 +From: Gerald Schaefer +Date: Sat, 21 Nov 2020 22:17:15 -0800 +Subject: mm/userfaultfd: do not access vma->vm_mm after calling handle_userfault() + +From: Gerald Schaefer + +commit bfe8cc1db02ab243c62780f17fc57f65bde0afe1 upstream. + +Alexander reported a syzkaller / KASAN finding on s390, see below for +complete output. + +In do_huge_pmd_anonymous_page(), the pre-allocated pagetable will be +freed in some cases. In the case of userfaultfd_missing(), this will +happen after calling handle_userfault(), which might have released the +mmap_lock. Therefore, the following pte_free(vma->vm_mm, pgtable) will +access an unstable vma->vm_mm, which could have been freed or re-used +already. + +For all architectures other than s390 this will go w/o any negative +impact, because pte_free() simply frees the page and ignores the +passed-in mm. The implementation for SPARC32 would also access +mm->page_table_lock for pte_free(), but there is no THP support in +SPARC32, so the buggy code path will not be used there. + +For s390, the mm->context.pgtable_list is being used to maintain the 2K +pagetable fragments, and operating on an already freed or even re-used +mm could result in various more or less subtle bugs due to list / +pagetable corruption. + +Fix this by calling pte_free() before handle_userfault(), similar to how +it is already done in __do_huge_pmd_anonymous_page() for the WRITE / +non-huge_zero_page case. + +Commit 6b251fc96cf2c ("userfaultfd: call handle_userfault() for +userfaultfd_missing() faults") actually introduced both, the +do_huge_pmd_anonymous_page() and also __do_huge_pmd_anonymous_page() +changes with respect to calling handle_userfault(), but only in the latter case +it put the pte_free() before calling handle_userfault(). 
+ + BUG: KASAN: use-after-free in do_huge_pmd_anonymous_page+0xcda/0xd90 mm/huge_memory.c:744 + Read of size 8 at addr 00000000962d6988 by task syz-executor.0/9334 + + CPU: 1 PID: 9334 Comm: syz-executor.0 Not tainted 5.10.0-rc1-syzkaller-07083-g4c9720875573 #0 + Hardware name: IBM 3906 M04 701 (KVM/Linux) + Call Trace: + do_huge_pmd_anonymous_page+0xcda/0xd90 mm/huge_memory.c:744 + create_huge_pmd mm/memory.c:4256 [inline] + __handle_mm_fault+0xe6e/0x1068 mm/memory.c:4480 + handle_mm_fault+0x288/0x748 mm/memory.c:4607 + do_exception+0x394/0xae0 arch/s390/mm/fault.c:479 + do_dat_exception+0x34/0x80 arch/s390/mm/fault.c:567 + pgm_check_handler+0x1da/0x22c arch/s390/kernel/entry.S:706 + copy_from_user_mvcos arch/s390/lib/uaccess.c:111 [inline] + raw_copy_from_user+0x3a/0x88 arch/s390/lib/uaccess.c:174 + _copy_from_user+0x48/0xa8 lib/usercopy.c:16 + copy_from_user include/linux/uaccess.h:192 [inline] + __do_sys_sigaltstack kernel/signal.c:4064 [inline] + __s390x_sys_sigaltstack+0xc8/0x240 kernel/signal.c:4060 + system_call+0xe0/0x28c arch/s390/kernel/entry.S:415 + + Allocated by task 9334: + slab_alloc_node mm/slub.c:2891 [inline] + slab_alloc mm/slub.c:2899 [inline] + kmem_cache_alloc+0x118/0x348 mm/slub.c:2904 + vm_area_dup+0x9c/0x2b8 kernel/fork.c:356 + __split_vma+0xba/0x560 mm/mmap.c:2742 + split_vma+0xca/0x108 mm/mmap.c:2800 + mlock_fixup+0x4ae/0x600 mm/mlock.c:550 + apply_vma_lock_flags+0x2c6/0x398 mm/mlock.c:619 + do_mlock+0x1aa/0x718 mm/mlock.c:711 + __do_sys_mlock2 mm/mlock.c:738 [inline] + __s390x_sys_mlock2+0x86/0xa8 mm/mlock.c:728 + system_call+0xe0/0x28c arch/s390/kernel/entry.S:415 + + Freed by task 9333: + slab_free mm/slub.c:3142 [inline] + kmem_cache_free+0x7c/0x4b8 mm/slub.c:3158 + __vma_adjust+0x7b2/0x2508 mm/mmap.c:960 + vma_merge+0x87e/0xce0 mm/mmap.c:1209 + userfaultfd_release+0x412/0x6b8 fs/userfaultfd.c:868 + __fput+0x22c/0x7a8 fs/file_table.c:281 + task_work_run+0x200/0x320 kernel/task_work.c:151 + tracehook_notify_resume 
include/linux/tracehook.h:188 [inline] + do_notify_resume+0x100/0x148 arch/s390/kernel/signal.c:538 + system_call+0xe6/0x28c arch/s390/kernel/entry.S:416 + + The buggy address belongs to the object at 00000000962d6948 which belongs to the cache vm_area_struct of size 200 + The buggy address is located 64 bytes inside of 200-byte region [00000000962d6948, 00000000962d6a10) + The buggy address belongs to the page: page:00000000313a09fe refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x962d6 flags: 0x3ffff00000000200(slab) + raw: 3ffff00000000200 000040000257e080 0000000c0000000c 000000008020ba00 + raw: 0000000000000000 000f001e00000000 ffffffff00000001 0000000096959501 + page dumped because: kasan: bad access detected + page->mem_cgroup:0000000096959501 + + Memory state around the buggy address: + 00000000962d6880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + 00000000962d6900: 00 fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb + >00000000962d6980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + 00000000962d6a00: fb fb fc fc fc fc fc fc fc fc 00 00 00 00 00 00 + 00000000962d6a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + ================================================================== + +Changes for v4.4 stable: + - Make it apply w/o + * Commit 4cf58924951ef ("mm: treewide: remove unused address argument + from pte_alloc functions") + * Commit 2b7403035459c ("mm: Change return type int to vm_fault_t for + fault handlers") + * Commit 82b0f8c39a386 ("mm: join struct fault_env and vm_fault") + * Commit bae473a423f65 ("mm: introduce fault_env") + * Commit 6fcb52a56ff60 ("thp: reduce usage of huge zero page's atomic counter") + +Fixes: 6b251fc96cf2c ("userfaultfd: call handle_userfault() for userfaultfd_missing() faults") +Reported-by: Alexander Egorenkov +Signed-off-by: Gerald Schaefer +Signed-off-by: Andrew Morton +Cc: Andrea Arcangeli +Cc: Heiko Carstens +Cc: [4.3+] +Link: 
https://lkml.kernel.org/r/20201110190329.11920-1-gerald.schaefer@linux.ibm.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -824,7 +824,6 @@ int do_huge_pmd_anonymous_page(struct mm + spinlock_t *ptl; + pgtable_t pgtable; + struct page *zero_page; +- bool set; + int ret; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) +@@ -837,10 +836,11 @@ int do_huge_pmd_anonymous_page(struct mm + } + ptl = pmd_lock(mm, pmd); + ret = 0; +- set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); ++ pte_free(mm, pgtable); ++ put_huge_zero_page(); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); +@@ -849,11 +849,9 @@ int do_huge_pmd_anonymous_page(struct mm + haddr, pmd, + zero_page); + spin_unlock(ptl); +- set = true; + } +- } else ++ } else { + spin_unlock(ptl); +- if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); + } diff --git a/queue-4.4/series b/queue-4.4/series index 8c96e7d8eb5..29cc3d8f8c2 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -31,3 +31,6 @@ iommu-amd-set-dte-to-represent-512-irtes.patch spi-introduce-device-managed-spi-controller-allocation.patch spi-bcm2835-fix-use-after-free-on-unbind.patch spi-bcm2835-release-the-dma-channel-if-probe-fails-after-dma_init.patch +tracing-fix-userstacktrace-option-for-instances.patch +btrfs-cleanup-cow-block-on-error.patch +mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch diff --git a/queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch b/queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch new file mode 100644 index 00000000000..3c72ad8ea01 --- /dev/null +++ b/queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch @@ -0,0 +1,82 @@ +From bcee5278958802b40ee8b26679155a6d9231783e Mon Sep 17 
00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Fri, 4 Dec 2020 16:36:16 -0500 +Subject: tracing: Fix userstacktrace option for instances + +From: Steven Rostedt (VMware) + +commit bcee5278958802b40ee8b26679155a6d9231783e upstream. + +When the instances were able to use their own options, the userstacktrace +option was left hardcoded for the top level. This made the instance +userstacktrace option basically into a nop, and will confuse users that set +it, but nothing happens (I was confused when it happened to me!) + +Cc: stable@vger.kernel.org +Fixes: 16270145ce6b ("tracing: Add trace options for core options to instances") +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/trace.c | 9 +++++---- + kernel/trace/trace.h | 6 ++++-- + 2 files changed, 9 insertions(+), 6 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -1706,7 +1706,7 @@ void trace_buffer_unlock_commit(struct t + __buffer_unlock_commit(buffer, event); + + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); +- ftrace_trace_userstack(buffer, flags, pc); ++ ftrace_trace_userstack(tr, buffer, flags, pc); + } + EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); + +@@ -1768,7 +1768,7 @@ void trace_buffer_unlock_commit_regs(str + * two. They are that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 
0 : 4, pc, regs); +- ftrace_trace_userstack(buffer, flags, pc); ++ ftrace_trace_userstack(tr, buffer, flags, pc); + } + EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); + +@@ -1941,14 +1941,15 @@ void trace_dump_stack(int skip) + static DEFINE_PER_CPU(int, user_stack_count); + + void +-ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) ++ftrace_trace_userstack(struct trace_array *tr, ++ struct ring_buffer *buffer, unsigned long flags, int pc) + { + struct trace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; + +- if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) ++ if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + /* +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -656,13 +656,15 @@ void update_max_tr_single(struct trace_a + #endif /* CONFIG_TRACER_MAX_TRACE */ + + #ifdef CONFIG_STACKTRACE +-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, ++void ftrace_trace_userstack(struct trace_array *tr, ++ struct ring_buffer *buffer, unsigned long flags, + int pc); + + void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); + #else +-static inline void ftrace_trace_userstack(struct ring_buffer *buffer, ++static inline void ftrace_trace_userstack(struct trace_array *tr, ++ struct ring_buffer *buffer, + unsigned long flags, int pc) + { + }