6.7-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
        Fri, 29 Mar 2024 10:47:32 +0000 (11:47 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
        Fri, 29 Mar 2024 10:47:32 +0000 (11:47 +0100)
added patches:
btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch

queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch [new file with mode: 0644]
queue-6.7/series

diff --git a/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch b/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch
new file mode 100644
index 0000000..70ed785
--- /dev/null
+++ b/queue-6.7/btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch
@@ -0,0 +1,241 @@
+From b0ad381fa7690244802aed119b478b4bdafc31dd Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Mon, 12 Feb 2024 11:56:02 -0500
+Subject: btrfs: fix deadlock with fiemap and extent locking
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit b0ad381fa7690244802aed119b478b4bdafc31dd upstream.
+
+While working on the patchset to remove extent locking I got a lockdep
+splat with fiemap and pagefaulting with my new extent lock replacement
+lock.
+
+This deadlock exists with our normal code; we just don't have lockdep
+annotations for the extent locking, so we've never noticed it.
+
+Since we're copying the fiemap extent to user space on every iteration,
+we have the chance of pagefaulting.  Because we hold the extent lock for
+the entire range, we could mkwrite into a range in the file that we have
+mmap'ed.  This would deadlock with the following stack trace:
+
+[<0>] lock_extent+0x28d/0x2f0
+[<0>] btrfs_page_mkwrite+0x273/0x8a0
+[<0>] do_page_mkwrite+0x50/0xb0
+[<0>] do_fault+0xc1/0x7b0
+[<0>] __handle_mm_fault+0x2fa/0x460
+[<0>] handle_mm_fault+0xa4/0x330
+[<0>] do_user_addr_fault+0x1f4/0x800
+[<0>] exc_page_fault+0x7c/0x1e0
+[<0>] asm_exc_page_fault+0x26/0x30
+[<0>] rep_movs_alternative+0x33/0x70
+[<0>] _copy_to_user+0x49/0x70
+[<0>] fiemap_fill_next_extent+0xc8/0x120
+[<0>] emit_fiemap_extent+0x4d/0xa0
+[<0>] extent_fiemap+0x7f8/0xad0
+[<0>] btrfs_fiemap+0x49/0x80
+[<0>] __x64_sys_ioctl+0x3e1/0xb50
+[<0>] do_syscall_64+0x94/0x1a0
+[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+I wrote an fstest to reproduce this deadlock without my replacement lock
+and verified that the deadlock exists with our existing locking.
+
+To fix this, simply don't take the extent lock for the entire duration
+of the fiemap.  This is safe in general because we keep track of where
+we are when we're searching the tree, so if an ordered extent updates in
+the middle of our fiemap call, we'll still emit the correct extents
+because we know what offset we were at before.
+
+The only place we maintain the lock is while searching for delalloc.
+Since the delalloc state can change during writeback, we want to lock
+the extent range so we have a consistent view of delalloc at the time
+we're checking to see if we need to set the delalloc flag.
+
+With this patch applied we no longer deadlock with my testcase.
+
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   62 +++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 45 insertions(+), 17 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -2734,16 +2734,34 @@ static int fiemap_process_hole(struct bt
+        * it beyond i_size.
+        */
+       while (cur_offset < end && cur_offset < i_size) {
++              struct extent_state *cached_state = NULL;
+               u64 delalloc_start;
+               u64 delalloc_end;
+               u64 prealloc_start;
++              u64 lockstart;
++              u64 lockend;
+               u64 prealloc_len = 0;
+               bool delalloc;
++              lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
++              lockend = round_up(end, inode->root->fs_info->sectorsize);
++
++              /*
++               * We are only locking for the delalloc range because that's the
++               * only thing that can change here.  With fiemap we have a lock
++               * on the inode, so no buffered or direct writes can happen.
++               *
++               * However mmaps and normal page writeback will cause this to
++               * change arbitrarily.  We have to lock the extent lock here to
++               * make sure that nobody messes with the tree while we're doing
++               * btrfs_find_delalloc_in_range.
++               */
++              lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+               delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+                                                       delalloc_cached_state,
+                                                       &delalloc_start,
+                                                       &delalloc_end);
++              unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+               if (!delalloc)
+                       break;
+@@ -2911,15 +2929,15 @@ int extent_fiemap(struct btrfs_inode *in
+                 u64 start, u64 len)
+ {
+       const u64 ino = btrfs_ino(inode);
+-      struct extent_state *cached_state = NULL;
+       struct extent_state *delalloc_cached_state = NULL;
+       struct btrfs_path *path;
+       struct fiemap_cache cache = { 0 };
+       struct btrfs_backref_share_check_ctx *backref_ctx;
+       u64 last_extent_end;
+       u64 prev_extent_end;
+-      u64 lockstart;
+-      u64 lockend;
++      u64 range_start;
++      u64 range_end;
++      const u64 sectorsize = inode->root->fs_info->sectorsize;
+       bool stopped = false;
+       int ret;
+@@ -2930,12 +2948,11 @@ int extent_fiemap(struct btrfs_inode *in
+               goto out;
+       }
+-      lockstart = round_down(start, inode->root->fs_info->sectorsize);
+-      lockend = round_up(start + len, inode->root->fs_info->sectorsize);
+-      prev_extent_end = lockstart;
++      range_start = round_down(start, sectorsize);
++      range_end = round_up(start + len, sectorsize);
++      prev_extent_end = range_start;
+       btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+-      lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+       ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+       if (ret < 0)
+@@ -2943,7 +2960,7 @@ int extent_fiemap(struct btrfs_inode *in
+       btrfs_release_path(path);
+       path->reada = READA_FORWARD;
+-      ret = fiemap_search_slot(inode, path, lockstart);
++      ret = fiemap_search_slot(inode, path, range_start);
+       if (ret < 0) {
+               goto out_unlock;
+       } else if (ret > 0) {
+@@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *in
+               goto check_eof_delalloc;
+       }
+-      while (prev_extent_end < lockend) {
++      while (prev_extent_end < range_end) {
+               struct extent_buffer *leaf = path->nodes[0];
+               struct btrfs_file_extent_item *ei;
+               struct btrfs_key key;
+@@ -2978,19 +2995,19 @@ int extent_fiemap(struct btrfs_inode *in
+                * The first iteration can leave us at an extent item that ends
+                * before our range's start. Move to the next item.
+                */
+-              if (extent_end <= lockstart)
++              if (extent_end <= range_start)
+                       goto next_item;
+               backref_ctx->curr_leaf_bytenr = leaf->start;
+               /* We have in implicit hole (NO_HOLES feature enabled). */
+               if (prev_extent_end < key.offset) {
+-                      const u64 range_end = min(key.offset, lockend) - 1;
++                      const u64 hole_end = min(key.offset, range_end) - 1;
+                       ret = fiemap_process_hole(inode, fieinfo, &cache,
+                                                 &delalloc_cached_state,
+                                                 backref_ctx, 0, 0, 0,
+-                                                prev_extent_end, range_end);
++                                                prev_extent_end, hole_end);
+                       if (ret < 0) {
+                               goto out_unlock;
+                       } else if (ret > 0) {
+@@ -3000,7 +3017,7 @@ int extent_fiemap(struct btrfs_inode *in
+                       }
+                       /* We've reached the end of the fiemap range, stop. */
+-                      if (key.offset >= lockend) {
++                      if (key.offset >= range_end) {
+                               stopped = true;
+                               break;
+                       }
+@@ -3094,29 +3111,41 @@ check_eof_delalloc:
+       btrfs_free_path(path);
+       path = NULL;
+-      if (!stopped && prev_extent_end < lockend) {
++      if (!stopped && prev_extent_end < range_end) {
+               ret = fiemap_process_hole(inode, fieinfo, &cache,
+                                         &delalloc_cached_state, backref_ctx,
+-                                        0, 0, 0, prev_extent_end, lockend - 1);
++                                        0, 0, 0, prev_extent_end, range_end - 1);
+               if (ret < 0)
+                       goto out_unlock;
+-              prev_extent_end = lockend;
++              prev_extent_end = range_end;
+       }
+       if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+               const u64 i_size = i_size_read(&inode->vfs_inode);
+               if (prev_extent_end < i_size) {
++                      struct extent_state *cached_state = NULL;
+                       u64 delalloc_start;
+                       u64 delalloc_end;
++                      u64 lockstart;
++                      u64 lockend;
+                       bool delalloc;
++                      lockstart = round_down(prev_extent_end, sectorsize);
++                      lockend = round_up(i_size, sectorsize);
++
++                      /*
++                       * See the comment in fiemap_process_hole as to why
++                       * we're doing the locking here.
++                       */
++                      lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+                       delalloc = btrfs_find_delalloc_in_range(inode,
+                                                               prev_extent_end,
+                                                               i_size - 1,
+                                                               &delalloc_cached_state,
+                                                               &delalloc_start,
+                                                               &delalloc_end);
++                      unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+                       if (!delalloc)
+                               cache.flags |= FIEMAP_EXTENT_LAST;
+               } else {
+@@ -3127,7 +3156,6 @@ check_eof_delalloc:
+       ret = emit_last_fiemap_cache(fieinfo, &cache);
+ out_unlock:
+-      unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+       btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ out:
+       free_extent_state(delalloc_cached_state);
diff --git a/queue-6.7/series b/queue-6.7/series
index acf43da75b50dfa27fe111c9fdf8266a67d4d752..7f1b1654fd63a2d98c9785672ce88ff229f73050 100644
--- a/queue-6.7/series
+++ b/queue-6.7/series
@@ -292,3 +292,4 @@ x86-efistub-call-mixed-mode-boot-services-on-the-firmware-s-stack.patch
 asoc-amd-yc-revert-fix-non-functional-mic-on-lenovo-21j2.patch
 fix-memory-leak-in-posix_clock_open.patch
 wifi-rtw88-8821cu-fix-connection-failure.patch
+btrfs-fix-deadlock-with-fiemap-and-extent-locking.patch
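
For illustration, below is a minimal userspace sketch of the kind of reproducer
the commit message describes.  It is not the author's fstest; the mount point,
file path, file size, and extent count are illustrative assumptions.  The trick
is to back the fiemap extent array with a shared writable mapping of the very
file being queried: on an affected kernel, fiemap_fill_next_extent()'s
copy_to_user() write-faults that page, btrfs_page_mkwrite() runs, and it blocks
trying to lock an extent range that extent_fiemap() still holds.

/*
 * Hypothetical reproducer sketch -- not the fstest referenced in the
 * commit message.  Assumes /mnt/btrfs is a scratch btrfs mount; the
 * path, file size, and extent count are arbitrary.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/fs.h>		/* FS_IOC_FIEMAP */
#include <linux/fiemap.h>	/* struct fiemap */

int main(void)
{
	const long pgsz = sysconf(_SC_PAGESIZE);
	const size_t fsize = 1024 * 1024;
	struct fiemap *fm;
	char buf[4096];
	char *map;
	int fd;

	fd = open("/mnt/btrfs/victim", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Write real data so fiemap has at least one extent to emit. */
	memset(buf, 0xaa, sizeof(buf));
	for (size_t off = 0; off < fsize; off += sizeof(buf))
		pwrite(fd, buf, sizeof(buf), off);
	fsync(fd);

	map = mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Place the fiemap header at the tail of page 0 so fm_extents[]
	 * starts on page 1.  Filling in the header dirties page 0 only;
	 * the first extent copied back to user space then write-faults
	 * the still-clean page 1, invoking btrfs_page_mkwrite() in the
	 * middle of the fiemap call.
	 */
	fm = (struct fiemap *)(map + pgsz - sizeof(*fm));
	memset(fm, 0, sizeof(*fm));
	fm->fm_start = 0;
	fm->fm_length = fsize;
	fm->fm_extent_count = 64;

	/* Hangs on affected kernels; returns normally once fixed. */
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		perror("FS_IOC_FIEMAP");

	return 0;
}

On a kernel with this patch applied, the ioctl should simply return, because
the extent lock is held only around btrfs_find_delalloc_in_range(), where a
consistent view of delalloc is actually required.  On an affected kernel the
task hangs inside the ioctl with a stack like the one quoted in the commit
message, with btrfs_page_mkwrite() waiting on the range extent_fiemap() locked.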