]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 10 Sep 2024 07:57:02 +0000 (09:57 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 10 Sep 2024 07:57:02 +0000 (09:57 +0200)
added patches:
btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch

queue-6.6/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch b/queue-6.6/btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch
new file mode 100644 (file)
index 0000000..c04e3e6
--- /dev/null
@@ -0,0 +1,313 @@
+From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 29 Aug 2024 18:25:49 +0100
+Subject: btrfs: fix race between direct IO write and fsync when using same fd
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
+
+If we have 2 threads that are using the same file descriptor and one of
+them is doing direct IO writes while the other is doing fsync, we have a
+race where we can end up either:
+
+1) Attempt a fsync without holding the inode's lock, triggering an
+   assertion failures when assertions are enabled;
+
+2) Do an invalid memory access from the fsync task because the file private
+   points to memory allocated on stack by the direct IO task and it may be
+   used by the fsync task after the stack was destroyed.
+
+The race happens like this:
+
+1) A user space program opens a file descriptor with O_DIRECT;
+
+2) The program spawns 2 threads using libpthread for example;
+
+3) One of the threads uses the file descriptor to do direct IO writes,
+   while the other calls fsync using the same file descriptor.
+
+4) Call task A the thread doing direct IO writes and task B the thread
+   doing fsyncs;
+
+5) Task A does a direct IO write, and at btrfs_direct_write() sets the
+   file's private to an on stack allocated private with the member
+   'fsync_skip_inode_lock' set to true;
+
+6) Task B enters btrfs_sync_file() and sees that there's a private
+   structure associated to the file which has 'fsync_skip_inode_lock' set
+   to true, so it skips locking the inode's VFS lock;
+
+7) Task A completes the direct IO write, and resets the file's private to
+   NULL since it had no prior private and our private was stack allocated.
+   Then it unlocks the inode's VFS lock;
+
+8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
+   assertion that checks the inode's VFS lock is held fails, since task B
+   never locked it and task A has already unlocked it.
+
+The stack trace produced is the following:
+
+   assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
+   ------------[ cut here ]------------
+   kernel BUG at fs/btrfs/ordered-data.c:983!
+   Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
+   CPU: 9 PID: 5072 Comm: worker Tainted: G     U     OE      6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
+   Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
+   RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
+   Code: 50 d6 86 c0 e8 (...)
+   RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
+   RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
+   RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
+   RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
+   R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
+   R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
+   FS:  00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
+   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+   CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
+   Call Trace:
+    <TASK>
+    ? __die_body.cold+0x14/0x24
+    ? die+0x2e/0x50
+    ? do_trap+0xca/0x110
+    ? do_error_trap+0x6a/0x90
+    ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+    ? exc_invalid_op+0x50/0x70
+    ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+    ? asm_exc_invalid_op+0x1a/0x20
+    ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+    ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+    btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
+    ? __seccomp_filter+0x31d/0x4f0
+    __x64_sys_fdatasync+0x4f/0x90
+    do_syscall_64+0x82/0x160
+    ? do_futex+0xcb/0x190
+    ? __x64_sys_futex+0x10e/0x1d0
+    ? switch_fpu_return+0x4f/0xd0
+    ? syscall_exit_to_user_mode+0x72/0x220
+    ? do_syscall_64+0x8e/0x160
+    ? syscall_exit_to_user_mode+0x72/0x220
+    ? do_syscall_64+0x8e/0x160
+    ? syscall_exit_to_user_mode+0x72/0x220
+    ? do_syscall_64+0x8e/0x160
+    ? syscall_exit_to_user_mode+0x72/0x220
+    ? do_syscall_64+0x8e/0x160
+    entry_SYSCALL_64_after_hwframe+0x76/0x7e
+
+Another problem here is if task B grabs the private pointer and then uses
+it after task A has finished, since the private was allocated in the stack
+of task A, it results in some invalid memory access with a hard to predict
+result.
+
+This issue, triggering the assertion, was observed with QEMU workloads by
+two users in the Link tags below.
+
+Fix this by not relying on a file's private to pass information to fsync
+that it should skip locking the inode and instead pass this information
+through a special value stored in current->journal_info. This is safe
+because in the relevant section of the direct IO write path we are not
+holding a transaction handle, so current->journal_info is NULL.
+
+The following C program triggers the issue:
+
+   $ cat repro.c
+   /* Get the O_DIRECT definition. */
+   #ifndef _GNU_SOURCE
+   #define _GNU_SOURCE
+   #endif
+
+   #include <stdio.h>
+   #include <stdlib.h>
+   #include <unistd.h>
+   #include <stdint.h>
+   #include <fcntl.h>
+   #include <errno.h>
+   #include <string.h>
+   #include <pthread.h>
+
+   static int fd;
+
+   static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
+   {
+       while (count > 0) {
+           ssize_t ret;
+
+           ret = pwrite(fd, buf, count, offset);
+           if (ret < 0) {
+               if (errno == EINTR)
+                   continue;
+               return ret;
+           }
+           count -= ret;
+           buf += ret;
+       }
+       return 0;
+   }
+
+   static void *fsync_loop(void *arg)
+   {
+       while (1) {
+           int ret;
+
+           ret = fsync(fd);
+           if (ret != 0) {
+               perror("Fsync failed");
+               exit(6);
+           }
+       }
+   }
+
+   int main(int argc, char *argv[])
+   {
+       long pagesize;
+       void *write_buf;
+       pthread_t fsyncer;
+       int ret;
+
+       if (argc != 2) {
+           fprintf(stderr, "Use: %s <file path>\n", argv[0]);
+           return 1;
+       }
+
+       fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
+       if (fd == -1) {
+           perror("Failed to open/create file");
+           return 1;
+       }
+
+       pagesize = sysconf(_SC_PAGE_SIZE);
+       if (pagesize == -1) {
+           perror("Failed to get page size");
+           return 2;
+       }
+
+       ret = posix_memalign(&write_buf, pagesize, pagesize);
+       if (ret) {
+           perror("Failed to allocate buffer");
+           return 3;
+       }
+
+       ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
+       if (ret != 0) {
+           fprintf(stderr, "Failed to create writer thread: %d\n", ret);
+           return 4;
+       }
+
+       while (1) {
+           ret = do_write(fd, write_buf, pagesize, 0);
+           if (ret != 0) {
+               perror("Write failed");
+               exit(5);
+           }
+       }
+
+       return 0;
+   }
+
+   $ mkfs.btrfs -f /dev/sdi
+   $ mount /dev/sdi /mnt/sdi
+   $ timeout 10 ./repro /mnt/sdi/foo
+
+Usually the race is triggered within less than 1 second. A test case for
+fstests will follow soon.
+
+Reported-by: Paulo Dias <paulo.miguel.dias@gmail.com>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
+Reported-by: Andreas Jahn <jahn-andi@web.de>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
+Reported-by: syzbot+4704b3cc972bd76024f1@syzkaller.appspotmail.com
+Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
+Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h       |    1 -
+ fs/btrfs/file.c        |   25 ++++++++++---------------
+ fs/btrfs/transaction.h |    6 ++++++
+ 3 files changed, 16 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -445,7 +445,6 @@ struct btrfs_file_private {
+       void *filldir_buf;
+       u64 last_index;
+       struct extent_state *llseek_cached_state;
+-      bool fsync_skip_inode_lock;
+ };
+ static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1543,13 +1543,6 @@ again:
+       if (IS_ERR_OR_NULL(dio)) {
+               err = PTR_ERR_OR_ZERO(dio);
+       } else {
+-              struct btrfs_file_private stack_private = { 0 };
+-              struct btrfs_file_private *private;
+-              const bool have_private = (file->private_data != NULL);
+-
+-              if (!have_private)
+-                      file->private_data = &stack_private;
+-
+               /*
+                * If we have a synchoronous write, we must make sure the fsync
+                * triggered by the iomap_dio_complete() call below doesn't
+@@ -1558,13 +1551,10 @@ again:
+                * partial writes due to the input buffer (or parts of it) not
+                * being already faulted in.
+                */
+-              private = file->private_data;
+-              private->fsync_skip_inode_lock = true;
++              ASSERT(current->journal_info == NULL);
++              current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
+               err = iomap_dio_complete(dio);
+-              private->fsync_skip_inode_lock = false;
+-
+-              if (!have_private)
+-                      file->private_data = NULL;
++              current->journal_info = NULL;
+       }
+       /* No increment (+=) because iomap returns a cumulative value. */
+@@ -1796,7 +1786,6 @@ static inline bool skip_inode_logging(co
+  */
+ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+ {
+-      struct btrfs_file_private *private = file->private_data;
+       struct dentry *dentry = file_dentry(file);
+       struct inode *inode = d_inode(dentry);
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+@@ -1806,7 +1795,13 @@ int btrfs_sync_file(struct file *file, l
+       int ret = 0, err;
+       u64 len;
+       bool full_sync;
+-      const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
++      bool skip_ilock = false;
++
++      if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
++              skip_ilock = true;
++              current->journal_info = NULL;
++              lockdep_assert_held(&inode->i_rwsem);
++      }
+       trace_btrfs_sync_file(file, datasync);
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -12,6 +12,12 @@
+ #include "ctree.h"
+ #include "misc.h"
++/*
++ * Signal that a direct IO write is in progress, to avoid deadlock for sync
++ * direct IO writes when fsync is called during the direct IO write path.
++ */
++#define BTRFS_TRANS_DIO_WRITE_STUB    ((void *) 1)
++
+ /* Radix-tree tag for roots that are part of the trasaction. */
+ #define BTRFS_ROOT_TRANS_TAG                  0
index fa99db9943c0bd39022e51127fd85868c168e87c..2716e1f47412c46076642c247ab1a761e2041419 100644 (file)
@@ -262,3 +262,4 @@ riscv-do-not-restrict-memory-size-because-of-linear-.patch
 ublk_drv-fix-null-pointer-dereference-in-ublk_ctrl_s.patch
 membarrier-riscv-add-full-memory-barrier-in-switch_mm.patch
 x86-mm-fix-pti-for-i386-some-more.patch
+btrfs-fix-race-between-direct-io-write-and-fsync-when-using-same-fd.patch