git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Dec 2020 12:52:53 +0000 (13:52 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Dec 2020 12:52:53 +0000 (13:52 +0100)
added patches:
btrfs-cleanup-cow-block-on-error.patch
mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch
tracing-fix-userstacktrace-option-for-instances.patch

queue-4.4/btrfs-cleanup-cow-block-on-error.patch [new file with mode: 0644]
queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch [new file with mode: 0644]
queue-4.4/series
queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch [new file with mode: 0644]

diff --git a/queue-4.4/btrfs-cleanup-cow-block-on-error.patch b/queue-4.4/btrfs-cleanup-cow-block-on-error.patch
new file mode 100644 (file)
index 0000000..f1e7494
--- /dev/null
@@ -0,0 +1,136 @@
+From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 29 Sep 2020 08:53:54 -0400
+Subject: btrfs: cleanup cow block on error
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 572c83acdcdafeb04e70aa46be1fa539310be20c upstream.
+
+In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead
+to a system lockup. It gets stuck trying to write back inodes, and the
+write back thread was trying to lock an extent buffer:
+
+  $ cat /proc/2143497/stack
+  [<0>] __btrfs_tree_lock+0x108/0x250
+  [<0>] lock_extent_buffer_for_io+0x35e/0x3a0
+  [<0>] btree_write_cache_pages+0x15a/0x3b0
+  [<0>] do_writepages+0x28/0xb0
+  [<0>] __writeback_single_inode+0x54/0x5c0
+  [<0>] writeback_sb_inodes+0x1e8/0x510
+  [<0>] wb_writeback+0xcc/0x440
+  [<0>] wb_workfn+0xd7/0x650
+  [<0>] process_one_work+0x236/0x560
+  [<0>] worker_thread+0x55/0x3c0
+  [<0>] kthread+0x13a/0x150
+  [<0>] ret_from_fork+0x1f/0x30
+
+This is because we got an error while COWing a block, specifically here
+
+        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+                if (ret) {
+                        btrfs_abort_transaction(trans, ret);
+                        return ret;
+                }
+        }
+
+  [16402.241552] BTRFS: Transaction aborted (error -2)
+  [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540
+  [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8
+  [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+  [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540
+  [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282
+  [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388
+  [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380
+  [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000
+  [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0
+  [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000
+  [16402.256321] FS:  00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000
+  [16402.256973] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0
+  [16402.257867] Call Trace:
+  [16402.258072]  btrfs_cow_block+0x109/0x230
+  [16402.258356]  btrfs_search_slot+0x530/0x9d0
+  [16402.258655]  btrfs_lookup_file_extent+0x37/0x40
+  [16402.259155]  __btrfs_drop_extents+0x13c/0xd60
+  [16402.259628]  ? btrfs_block_rsv_migrate+0x4f/0xb0
+  [16402.259949]  btrfs_replace_file_extents+0x190/0x820
+  [16402.260873]  btrfs_clone+0x9ae/0xc00
+  [16402.261139]  btrfs_extent_same_range+0x66/0x90
+  [16402.261771]  btrfs_remap_file_range+0x353/0x3b1
+  [16402.262333]  vfs_dedupe_file_range_one.part.0+0xd5/0x140
+  [16402.262821]  vfs_dedupe_file_range+0x189/0x220
+  [16402.263150]  do_vfs_ioctl+0x552/0x700
+  [16402.263662]  __x64_sys_ioctl+0x62/0xb0
+  [16402.264023]  do_syscall_64+0x33/0x40
+  [16402.264364]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [16402.264862] RIP: 0033:0x7f542c7d15cb
+  [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+  [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb
+  [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003
+  [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64
+  [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002
+  [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80
+  [16402.271498] irq event stamp: 0
+  [16402.271846] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
+  [16402.272497] hardirqs last disabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+  [16402.273343] softirqs last  enabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+  [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0
+  [16402.274338] ---[ end trace 737874a5a41a8236 ]---
+  [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.276179] BTRFS info (device dm-9): forced readonly
+  [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry
+  [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30
+
+The problem here is that as soon as we allocate the new block it is
+locked and marked dirty in the btree inode.  This means that we could
+attempt to writeback this block and need to lock the extent buffer.
+However we're not unlocking it here and thus we deadlock.
+
+Fix this by unlocking the cow block if we have any errors inside of
+__btrfs_cow_block, and also free it so we do not leak it.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+[sudip: use old btrfs_abort_transaction()]
+Signed-off-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1129,6 +1129,8 @@ static noinline int __btrfs_cow_block(st
+       ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+       if (ret) {
++              btrfs_tree_unlock(cow);
++              free_extent_buffer(cow);
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
+       }
+@@ -1136,6 +1138,8 @@ static noinline int __btrfs_cow_block(st
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+               ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+               if (ret) {
++                      btrfs_tree_unlock(cow);
++                      free_extent_buffer(cow);
+                       btrfs_abort_transaction(trans, root, ret);
+                       return ret;
+               }
+@@ -1174,6 +1178,8 @@ static noinline int __btrfs_cow_block(st
+               if (last_ref) {
+                       ret = tree_mod_log_free_eb(root->fs_info, buf);
+                       if (ret) {
++                              btrfs_tree_unlock(cow);
++                              free_extent_buffer(cow);
+                               btrfs_abort_transaction(trans, root, ret);
+                               return ret;
+                       }
diff --git a/queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch b/queue-4.4/mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch
new file mode 100644 (file)
index 0000000..6865858
--- /dev/null
@@ -0,0 +1,165 @@
+From bfe8cc1db02ab243c62780f17fc57f65bde0afe1 Mon Sep 17 00:00:00 2001
+From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Date: Sat, 21 Nov 2020 22:17:15 -0800
+Subject: mm/userfaultfd: do not access vma->vm_mm after calling handle_userfault()
+
+From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+
+commit bfe8cc1db02ab243c62780f17fc57f65bde0afe1 upstream.
+
+Alexander reported a syzkaller / KASAN finding on s390, see below for
+complete output.
+
+In do_huge_pmd_anonymous_page(), the pre-allocated pagetable will be
+freed in some cases.  In the case of userfaultfd_missing(), this will
+happen after calling handle_userfault(), which might have released the
+mmap_lock.  Therefore, the following pte_free(vma->vm_mm, pgtable) will
+access an unstable vma->vm_mm, which could have been freed or re-used
+already.
+
+For all architectures other than s390 this will go w/o any negative
+impact, because pte_free() simply frees the page and ignores the
+passed-in mm.  The implementation for SPARC32 would also access
+mm->page_table_lock for pte_free(), but there is no THP support in
+SPARC32, so the buggy code path will not be used there.
+
+For s390, the mm->context.pgtable_list is being used to maintain the 2K
+pagetable fragments, and operating on an already freed or even re-used
+mm could result in various more or less subtle bugs due to list /
+pagetable corruption.
+
+Fix this by calling pte_free() before handle_userfault(), similar to how
+it is already done in __do_huge_pmd_anonymous_page() for the WRITE /
+non-huge_zero_page case.
+
+Commit 6b251fc96cf2c ("userfaultfd: call handle_userfault() for
+userfaultfd_missing() faults") actually introduced both, the
+do_huge_pmd_anonymous_page() and also __do_huge_pmd_anonymous_page()
+changes with respect to calling handle_userfault(), but only in the latter case
+it put the pte_free() before calling handle_userfault().
+
+  BUG: KASAN: use-after-free in do_huge_pmd_anonymous_page+0xcda/0xd90 mm/huge_memory.c:744
+  Read of size 8 at addr 00000000962d6988 by task syz-executor.0/9334
+
+  CPU: 1 PID: 9334 Comm: syz-executor.0 Not tainted 5.10.0-rc1-syzkaller-07083-g4c9720875573 #0
+  Hardware name: IBM 3906 M04 701 (KVM/Linux)
+  Call Trace:
+    do_huge_pmd_anonymous_page+0xcda/0xd90 mm/huge_memory.c:744
+    create_huge_pmd mm/memory.c:4256 [inline]
+    __handle_mm_fault+0xe6e/0x1068 mm/memory.c:4480
+    handle_mm_fault+0x288/0x748 mm/memory.c:4607
+    do_exception+0x394/0xae0 arch/s390/mm/fault.c:479
+    do_dat_exception+0x34/0x80 arch/s390/mm/fault.c:567
+    pgm_check_handler+0x1da/0x22c arch/s390/kernel/entry.S:706
+    copy_from_user_mvcos arch/s390/lib/uaccess.c:111 [inline]
+    raw_copy_from_user+0x3a/0x88 arch/s390/lib/uaccess.c:174
+    _copy_from_user+0x48/0xa8 lib/usercopy.c:16
+    copy_from_user include/linux/uaccess.h:192 [inline]
+    __do_sys_sigaltstack kernel/signal.c:4064 [inline]
+    __s390x_sys_sigaltstack+0xc8/0x240 kernel/signal.c:4060
+    system_call+0xe0/0x28c arch/s390/kernel/entry.S:415
+
+  Allocated by task 9334:
+    slab_alloc_node mm/slub.c:2891 [inline]
+    slab_alloc mm/slub.c:2899 [inline]
+    kmem_cache_alloc+0x118/0x348 mm/slub.c:2904
+    vm_area_dup+0x9c/0x2b8 kernel/fork.c:356
+    __split_vma+0xba/0x560 mm/mmap.c:2742
+    split_vma+0xca/0x108 mm/mmap.c:2800
+    mlock_fixup+0x4ae/0x600 mm/mlock.c:550
+    apply_vma_lock_flags+0x2c6/0x398 mm/mlock.c:619
+    do_mlock+0x1aa/0x718 mm/mlock.c:711
+    __do_sys_mlock2 mm/mlock.c:738 [inline]
+    __s390x_sys_mlock2+0x86/0xa8 mm/mlock.c:728
+    system_call+0xe0/0x28c arch/s390/kernel/entry.S:415
+
+  Freed by task 9333:
+    slab_free mm/slub.c:3142 [inline]
+    kmem_cache_free+0x7c/0x4b8 mm/slub.c:3158
+    __vma_adjust+0x7b2/0x2508 mm/mmap.c:960
+    vma_merge+0x87e/0xce0 mm/mmap.c:1209
+    userfaultfd_release+0x412/0x6b8 fs/userfaultfd.c:868
+    __fput+0x22c/0x7a8 fs/file_table.c:281
+    task_work_run+0x200/0x320 kernel/task_work.c:151
+    tracehook_notify_resume include/linux/tracehook.h:188 [inline]
+    do_notify_resume+0x100/0x148 arch/s390/kernel/signal.c:538
+    system_call+0xe6/0x28c arch/s390/kernel/entry.S:416
+
+  The buggy address belongs to the object at 00000000962d6948 which belongs to the cache vm_area_struct of size 200
+  The buggy address is located 64 bytes inside of 200-byte region [00000000962d6948, 00000000962d6a10)
+  The buggy address belongs to the page: page:00000000313a09fe refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x962d6 flags: 0x3ffff00000000200(slab)
+  raw: 3ffff00000000200 000040000257e080 0000000c0000000c 000000008020ba00
+  raw: 0000000000000000 000f001e00000000 ffffffff00000001 0000000096959501
+  page dumped because: kasan: bad access detected
+  page->mem_cgroup:0000000096959501
+
+  Memory state around the buggy address:
+   00000000962d6880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+   00000000962d6900: 00 fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb
+  >00000000962d6980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                        ^
+   00000000962d6a00: fb fb fc fc fc fc fc fc fc fc 00 00 00 00 00 00
+   00000000962d6a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+  ==================================================================
+
+Changes for v4.4 stable:
+  - Make it apply w/o
+    * Commit 4cf58924951ef ("mm: treewide: remove unused address argument
+      from pte_alloc functions")
+    * Commit 2b7403035459c ("mm: Change return type int to vm_fault_t for
+      fault handlers")
+    * Commit 82b0f8c39a386 ("mm: join struct fault_env and vm_fault")
+    * Commit bae473a423f65 ("mm: introduce fault_env")
+    * Commit 6fcb52a56ff60 ("thp: reduce usage of huge zero page's atomic counter")
+
+Fixes: 6b251fc96cf2c ("userfaultfd: call handle_userfault() for userfaultfd_missing() faults")
+Reported-by: Alexander Egorenkov <egorenar@linux.ibm.com>
+Signed-off-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: <stable@vger.kernel.org>   [4.3+]
+Link: https://lkml.kernel.org/r/20201110190329.11920-1-gerald.schaefer@linux.ibm.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |    8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -824,7 +824,6 @@ int do_huge_pmd_anonymous_page(struct mm
+               spinlock_t *ptl;
+               pgtable_t pgtable;
+               struct page *zero_page;
+-              bool set;
+               int ret;
+               pgtable = pte_alloc_one(mm, haddr);
+               if (unlikely(!pgtable))
+@@ -837,10 +836,11 @@ int do_huge_pmd_anonymous_page(struct mm
+               }
+               ptl = pmd_lock(mm, pmd);
+               ret = 0;
+-              set = false;
+               if (pmd_none(*pmd)) {
+                       if (userfaultfd_missing(vma)) {
+                               spin_unlock(ptl);
++                              pte_free(mm, pgtable);
++                              put_huge_zero_page();
+                               ret = handle_userfault(vma, address, flags,
+                                                      VM_UFFD_MISSING);
+                               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+@@ -849,11 +849,9 @@ int do_huge_pmd_anonymous_page(struct mm
+                                                  haddr, pmd,
+                                                  zero_page);
+                               spin_unlock(ptl);
+-                              set = true;
+                       }
+-              } else
++              } else {
+                       spin_unlock(ptl);
+-              if (!set) {
+                       pte_free(mm, pgtable);
+                       put_huge_zero_page();
+               }
index 8c96e7d8eb54b10fae17e640a21e3a757ba9e00b..29cc3d8f8c2735392966a5cf5afcc8483c622fd8 100644 (file)
@@ -31,3 +31,6 @@ iommu-amd-set-dte-to-represent-512-irtes.patch
 spi-introduce-device-managed-spi-controller-allocation.patch
 spi-bcm2835-fix-use-after-free-on-unbind.patch
 spi-bcm2835-release-the-dma-channel-if-probe-fails-after-dma_init.patch
+tracing-fix-userstacktrace-option-for-instances.patch
+btrfs-cleanup-cow-block-on-error.patch
+mm-userfaultfd-do-not-access-vma-vm_mm-after-calling-handle_userfault.patch
diff --git a/queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch b/queue-4.4/tracing-fix-userstacktrace-option-for-instances.patch
new file mode 100644 (file)
index 0000000..3c72ad8
--- /dev/null
@@ -0,0 +1,82 @@
+From bcee5278958802b40ee8b26679155a6d9231783e Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Fri, 4 Dec 2020 16:36:16 -0500
+Subject: tracing: Fix userstacktrace option for instances
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit bcee5278958802b40ee8b26679155a6d9231783e upstream.
+
+When the instances were able to use their own options, the userstacktrace
+option was left hardcoded for the top level. This made the instance
+userstacktrace option basically into a nop, and will confuse users that set
+it, but nothing happens (I was confused when it happened to me!)
+
+Cc: stable@vger.kernel.org
+Fixes: 16270145ce6b ("tracing: Add trace options for core options to instances")
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/trace.c |    9 +++++----
+ kernel/trace/trace.h |    6 ++++--
+ 2 files changed, 9 insertions(+), 6 deletions(-)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -1706,7 +1706,7 @@ void trace_buffer_unlock_commit(struct t
+       __buffer_unlock_commit(buffer, event);
+       ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+-      ftrace_trace_userstack(buffer, flags, pc);
++      ftrace_trace_userstack(tr, buffer, flags, pc);
+ }
+ EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
+@@ -1768,7 +1768,7 @@ void trace_buffer_unlock_commit_regs(str
+        * two. They are that meaningful.
+        */
+       ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+-      ftrace_trace_userstack(buffer, flags, pc);
++      ftrace_trace_userstack(tr, buffer, flags, pc);
+ }
+ EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
+@@ -1941,14 +1941,15 @@ void trace_dump_stack(int skip)
+ static DEFINE_PER_CPU(int, user_stack_count);
+ void
+-ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
++ftrace_trace_userstack(struct trace_array *tr,
++                     struct ring_buffer *buffer, unsigned long flags, int pc)
+ {
+       struct trace_event_call *call = &event_user_stack;
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+-      if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
++      if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+       /*
+--- a/kernel/trace/trace.h
++++ b/kernel/trace/trace.h
+@@ -656,13 +656,15 @@ void update_max_tr_single(struct trace_a
+ #endif /* CONFIG_TRACER_MAX_TRACE */
+ #ifdef CONFIG_STACKTRACE
+-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
++void ftrace_trace_userstack(struct trace_array *tr,
++                          struct ring_buffer *buffer, unsigned long flags,
+                           int pc);
+ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+                  int pc);
+ #else
+-static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
++static inline void ftrace_trace_userstack(struct trace_array *tr,
++                                        struct ring_buffer *buffer,
+                                         unsigned long flags, int pc)
+ {
+ }