--- /dev/null
+From d92b83f592d810aded2e5f90db5f560cc8cf577b Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Mon, 15 Jan 2024 14:15:26 -0500
+Subject: bcachefs: bch2_kthread_io_clock_wait() no longer sleeps until full amount
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+commit d92b83f592d810aded2e5f90db5f560cc8cf577b upstream.
+
+Drop the loop in bch2_kthread_io_clock_wait(): this allows the code
+that uses it to be woken up for other reasons, and fixes a bug where
+rebalance wouldn't wake up when a scan was requested.
+
+This raises the possibility of spurious wakeups, but callers should
+always be able to handle that reasonably well.
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/clock.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/bcachefs/clock.c
++++ b/fs/bcachefs/clock.c
+@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct i
+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+- while (1) {
++ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread && kthread_should_stop())
+ break;
+@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct i
+
+ schedule();
+ try_to_freeze();
+- }
++ } while (0);
+
+ __set_current_state(TASK_RUNNING);
+ del_timer_sync(&wait.cpu_timer);
--- /dev/null
+From 3e44f325f6f75078cdcd44cd337f517ba3650d05 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 11 Jan 2024 08:36:55 +0100
+Subject: bcachefs: fix incorrect usage of REQ_OP_FLUSH
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 3e44f325f6f75078cdcd44cd337f517ba3650d05 upstream.
+
+REQ_OP_FLUSH is only for internal use in the blk-mq and request based
+drivers. File systems and other block layer consumers must use
+REQ_OP_WRITE | REQ_PREFLUSH as documented in
+Documentation/block/writeback_cache_control.rst.
+
+While REQ_OP_FLUSH appears to work for blk-mq drivers it does not
+get the proper flush state machine handling, and completely fails
+for any bio based drivers, including all the stacking drivers. The
+block layer will also get a check in 6.8 to reject this use case
+entirely.
+
+[Note: completely untested, but as this never got fixed since the
+original bug report in November:
+
+ https://bugzilla.kernel.org/show_bug.cgi?id=218184
+
+and the discussion in December:
+
+ https://lore.kernel.org/all/20231221053016.72cqcfg46vxwohcj@moria.home.lan/T/
+
+this seems to be the best way to force it]
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-io.c | 2 +-
+ fs/bcachefs/journal_io.c | 3 ++-
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/bcachefs/fs-io.c
++++ b/fs/bcachefs/fs-io.c
+@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async
+ continue;
+
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+- REQ_OP_FLUSH,
++ REQ_OP_WRITE|REQ_PREFLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+--- a/fs/bcachefs/journal_io.c
++++ b/fs/bcachefs/journal_io.c
+@@ -1948,7 +1948,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
++ bio_reset(bio, ca->disk_sb.bdev,
++ REQ_OP_WRITE|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
--- /dev/null
+From 2acc59dd88d27ad69b66ded80df16c042b04eeec Mon Sep 17 00:00:00 2001
+From: Su Yue <glass.su@suse.com>
+Date: Mon, 15 Jan 2024 10:21:25 +0800
+Subject: bcachefs: grab s_umount only if snapshotting
+
+From: Su Yue <glass.su@suse.com>
+
+commit 2acc59dd88d27ad69b66ded80df16c042b04eeec upstream.
+
+When I was testing mongodb over bcachefs with compression,
+there was a lockdep warning when snapshotting the mongodb data volume.
+
+$ cat test.sh
+prog=bcachefs
+
+$prog subvolume create /mnt/data
+$prog subvolume create /mnt/data/snapshots
+
+while true;do
+ $prog subvolume snapshot /mnt/data /mnt/data/snapshots/$(date +%s)
+ sleep 1s
+done
+
+$ cat /etc/mongodb.conf
+systemLog:
+ destination: file
+ logAppend: true
+ path: /mnt/data/mongod.log
+
+storage:
+ dbPath: /mnt/data/
+
+lockdep reports:
+[ 3437.452330] ======================================================
+[ 3437.452750] WARNING: possible circular locking dependency detected
+[ 3437.453168] 6.7.0-rc7-custom+ #85 Tainted: G E
+[ 3437.453562] ------------------------------------------------------
+[ 3437.453981] bcachefs/35533 is trying to acquire lock:
+[ 3437.454325] ffffa0a02b2b1418 (sb_writers#10){.+.+}-{0:0}, at: filename_create+0x62/0x190
+[ 3437.454875]
+ but task is already holding lock:
+[ 3437.455268] ffffa0a02b2b10e0 (&type->s_umount_key#48){.+.+}-{3:3}, at: bch2_fs_file_ioctl+0x232/0xc90 [bcachefs]
+[ 3437.456009]
+ which lock already depends on the new lock.
+
+[ 3437.456553]
+ the existing dependency chain (in reverse order) is:
+[ 3437.457054]
+ -> #3 (&type->s_umount_key#48){.+.+}-{3:3}:
+[ 3437.457507] down_read+0x3e/0x170
+[ 3437.457772] bch2_fs_file_ioctl+0x232/0xc90 [bcachefs]
+[ 3437.458206] __x64_sys_ioctl+0x93/0xd0
+[ 3437.458498] do_syscall_64+0x42/0xf0
+[ 3437.458779] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 3437.459155]
+ -> #2 (&c->snapshot_create_lock){++++}-{3:3}:
+[ 3437.459615] down_read+0x3e/0x170
+[ 3437.459878] bch2_truncate+0x82/0x110 [bcachefs]
+[ 3437.460276] bchfs_truncate+0x254/0x3c0 [bcachefs]
+[ 3437.460686] notify_change+0x1f1/0x4a0
+[ 3437.461283] do_truncate+0x7f/0xd0
+[ 3437.461555] path_openat+0xa57/0xce0
+[ 3437.461836] do_filp_open+0xb4/0x160
+[ 3437.462116] do_sys_openat2+0x91/0xc0
+[ 3437.462402] __x64_sys_openat+0x53/0xa0
+[ 3437.462701] do_syscall_64+0x42/0xf0
+[ 3437.462982] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 3437.463359]
+ -> #1 (&sb->s_type->i_mutex_key#15){+.+.}-{3:3}:
+[ 3437.463843] down_write+0x3b/0xc0
+[ 3437.464223] bch2_write_iter+0x5b/0xcc0 [bcachefs]
+[ 3437.464493] vfs_write+0x21b/0x4c0
+[ 3437.464653] ksys_write+0x69/0xf0
+[ 3437.464839] do_syscall_64+0x42/0xf0
+[ 3437.465009] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 3437.465231]
+ -> #0 (sb_writers#10){.+.+}-{0:0}:
+[ 3437.465471] __lock_acquire+0x1455/0x21b0
+[ 3437.465656] lock_acquire+0xc6/0x2b0
+[ 3437.465822] mnt_want_write+0x46/0x1a0
+[ 3437.465996] filename_create+0x62/0x190
+[ 3437.466175] user_path_create+0x2d/0x50
+[ 3437.466352] bch2_fs_file_ioctl+0x2ec/0xc90 [bcachefs]
+[ 3437.466617] __x64_sys_ioctl+0x93/0xd0
+[ 3437.466791] do_syscall_64+0x42/0xf0
+[ 3437.466957] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 3437.467180]
+ other info that might help us debug this:
+
+[ 3437.467507] Chain exists of:
+ sb_writers#10 --> &c->snapshot_create_lock --> &type->s_umount_key#48
+
+[ 3437.467979] Possible unsafe locking scenario:
+
+[ 3437.468223] CPU0 CPU1
+[ 3437.468405] ---- ----
+[ 3437.468585] rlock(&type->s_umount_key#48);
+[ 3437.468758] lock(&c->snapshot_create_lock);
+[ 3437.469030] lock(&type->s_umount_key#48);
+[ 3437.469291] rlock(sb_writers#10);
+[ 3437.469434]
+ *** DEADLOCK ***
+
+[ 3437.469670] 2 locks held by bcachefs/35533:
+[ 3437.469838] #0: ffffa0a02ce00a88 (&c->snapshot_create_lock){++++}-{3:3}, at: bch2_fs_file_ioctl+0x1e3/0xc90 [bcachefs]
+[ 3437.470294] #1: ffffa0a02b2b10e0 (&type->s_umount_key#48){.+.+}-{3:3}, at: bch2_fs_file_ioctl+0x232/0xc90 [bcachefs]
+[ 3437.470744]
+ stack backtrace:
+[ 3437.470922] CPU: 7 PID: 35533 Comm: bcachefs Kdump: loaded Tainted: G E 6.7.0-rc7-custom+ #85
+[ 3437.471313] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
+[ 3437.471694] Call Trace:
+[ 3437.471795] <TASK>
+[ 3437.471884] dump_stack_lvl+0x57/0x90
+[ 3437.472035] check_noncircular+0x132/0x150
+[ 3437.472202] __lock_acquire+0x1455/0x21b0
+[ 3437.472369] lock_acquire+0xc6/0x2b0
+[ 3437.472518] ? filename_create+0x62/0x190
+[ 3437.472683] ? lock_is_held_type+0x97/0x110
+[ 3437.472856] mnt_want_write+0x46/0x1a0
+[ 3437.473025] ? filename_create+0x62/0x190
+[ 3437.473204] filename_create+0x62/0x190
+[ 3437.473380] user_path_create+0x2d/0x50
+[ 3437.473555] bch2_fs_file_ioctl+0x2ec/0xc90 [bcachefs]
+[ 3437.473819] ? lock_acquire+0xc6/0x2b0
+[ 3437.474002] ? __fget_files+0x2a/0x190
+[ 3437.474195] ? __fget_files+0xbc/0x190
+[ 3437.474380] ? lock_release+0xc5/0x270
+[ 3437.474567] ? __x64_sys_ioctl+0x93/0xd0
+[ 3437.474764] ? __pfx_bch2_fs_file_ioctl+0x10/0x10 [bcachefs]
+[ 3437.475090] __x64_sys_ioctl+0x93/0xd0
+[ 3437.475277] do_syscall_64+0x42/0xf0
+[ 3437.475454] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 3437.475691] RIP: 0033:0x7f2743c313af
+======================================================
+
+In __bch2_ioctl_subvolume_create(), we grab s_umount unconditionally
+and unlock it at the end of the function. There is a comment
+"why do we need this lock?" about the lock coming from
+commit 42d237320e98 ("bcachefs: Snapshot creation, deletion")
+The reason is that __bch2_ioctl_subvolume_create() calls
+sync_inodes_sb(), which requires s_umount to be held, to write back all
+dirty inodes before doing the snapshot work.
+
+Fix it by read locking s_umount for snapshotting only and unlocking
+s_umount after sync_inodes_sb().
+
+Signed-off-by: Su Yue <glass.su@suse.com>
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-ioctl.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/fs/bcachefs/fs-ioctl.c
++++ b/fs/bcachefs/fs-ioctl.c
+@@ -345,11 +345,12 @@ static long __bch2_ioctl_subvolume_creat
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+- /* why do we need this lock? */
+- down_read(&c->vfs_sb->s_umount);
+-
+- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
++ /* sync_inodes_sb enforce s_umount is locked */
++ down_read(&c->vfs_sb->s_umount);
+ sync_inodes_sb(c->vfs_sb);
++ up_read(&c->vfs_sb->s_umount);
++ }
+ retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+@@ -433,8 +434,6 @@ err2:
+ goto retry;
+ }
+ err1:
+- up_read(&c->vfs_sb->s_umount);
+-
+ return error;
+ }
+
--- /dev/null
+From 369acf97d6fd5da620d053d0f1878ffe32eff555 Mon Sep 17 00:00:00 2001
+From: Su Yue <glass.su@suse.com>
+Date: Tue, 16 Jan 2024 19:05:37 +0800
+Subject: bcachefs: kvfree bch_fs::snapshots in bch2_fs_snapshots_exit
+
+From: Su Yue <glass.su@suse.com>
+
+commit 369acf97d6fd5da620d053d0f1878ffe32eff555 upstream.
+
+bch_fs::snapshots is allocated by kvzalloc in __snapshot_t_mut.
+It should be freed by kvfree not kfree.
+Otherwise umount will trigger:
+
+[ 406.829178 ] BUG: unable to handle page fault for address: ffffe7b487148008
+[ 406.830676 ] #PF: supervisor read access in kernel mode
+[ 406.831643 ] #PF: error_code(0x0000) - not-present page
+[ 406.832487 ] PGD 0 P4D 0
+[ 406.832898 ] Oops: 0000 [#1] PREEMPT SMP PTI
+[ 406.833512 ] CPU: 2 PID: 1754 Comm: umount Kdump: loaded Tainted: G OE 6.7.0-rc7-custom+ #90
+[ 406.834746 ] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
+[ 406.835796 ] RIP: 0010:kfree+0x62/0x140
+[ 406.836197 ] Code: 80 48 01 d8 0f 82 e9 00 00 00 48 c7 c2 00 00 00 80 48 2b 15 78 9f 1f 01 48 01 d0 48 c1 e8 0c 48 c1 e0 06 48 03 05 56 9f 1f 01 <48> 8b 50 08 48 89 c7 f6 c2 01 0f 85 b0 00 00 00 66 90 48 8b 07 f6
+[ 406.837810 ] RSP: 0018:ffffb9d641607e48 EFLAGS: 00010286
+[ 406.838213 ] RAX: ffffe7b487148000 RBX: ffffb9d645200000 RCX: ffffb9d641607dc4
+[ 406.838738 ] RDX: 000065bb00000000 RSI: ffffffffc0d88b84 RDI: ffffb9d645200000
+[ 406.839217 ] RBP: ffff9a4625d00068 R08: 0000000000000001 R09: 0000000000000001
+[ 406.839650 ] R10: 0000000000000001 R11: 000000000000001f R12: ffff9a4625d4da80
+[ 406.840055 ] R13: ffff9a4625d00000 R14: ffffffffc0e2eb20 R15: 0000000000000000
+[ 406.840451 ] FS: 00007f0a264ffb80(0000) GS:ffff9a4e2d500000(0000) knlGS:0000000000000000
+[ 406.840851 ] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 406.841125 ] CR2: ffffe7b487148008 CR3: 000000018c4d2000 CR4: 00000000000006f0
+[ 406.841464 ] Call Trace:
+[ 406.841583 ] <TASK>
+[ 406.841682 ] ? __die+0x1f/0x70
+[ 406.841828 ] ? page_fault_oops+0x159/0x470
+[ 406.842014 ] ? fixup_exception+0x22/0x310
+[ 406.842198 ] ? exc_page_fault+0x1ed/0x200
+[ 406.842382 ] ? asm_exc_page_fault+0x22/0x30
+[ 406.842574 ] ? bch2_fs_release+0x54/0x280 [bcachefs]
+[ 406.842842 ] ? kfree+0x62/0x140
+[ 406.842988 ] ? kfree+0x104/0x140
+[ 406.843138 ] bch2_fs_release+0x54/0x280 [bcachefs]
+[ 406.843390 ] kobject_put+0xb7/0x170
+[ 406.843552 ] deactivate_locked_super+0x2f/0xa0
+[ 406.843756 ] cleanup_mnt+0xba/0x150
+[ 406.843917 ] task_work_run+0x59/0xa0
+[ 406.844083 ] exit_to_user_mode_prepare+0x197/0x1a0
+[ 406.844302 ] syscall_exit_to_user_mode+0x16/0x40
+[ 406.844510 ] do_syscall_64+0x4e/0xf0
+[ 406.844675 ] entry_SYSCALL_64_after_hwframe+0x6e/0x76
+[ 406.844907 ] RIP: 0033:0x7f0a2664e4fb
+
+Signed-off-by: Su Yue <glass.su@suse.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/snapshot.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/bcachefs/snapshot.c
++++ b/fs/bcachefs/snapshot.c
+@@ -1709,5 +1709,5 @@ int bch2_snapshots_read(struct bch_fs *c
+
+ void bch2_fs_snapshots_exit(struct bch_fs *c)
+ {
+- kfree(rcu_dereference_protected(c->snapshots, true));
++ kvfree(rcu_dereference_protected(c->snapshots, true));
+ }
--- /dev/null
+From 7b508b323b2ec45be59769bd4e4aeba729c52cf6 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Thu, 1 Feb 2024 21:01:02 -0500
+Subject: bcachefs: time_stats: Check for last_event == 0 when updating freq stats
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+commit 7b508b323b2ec45be59769bd4e4aeba729c52cf6 upstream.
+
+This fixes spurious outliers in the frequency stats.
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/util.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/bcachefs/util.c
++++ b/fs/bcachefs/util.c
+@@ -362,14 +362,15 @@ static inline void bch2_time_stats_updat
+ bch2_quantiles_update(&stats->quantiles, duration);
+ }
+
+- if (time_after64(end, stats->last_event)) {
++ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+- stats->last_event = end;
+ }
++
++ stats->last_event = end;
+ }
+
+ static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
--- /dev/null
+From 6bb3f7f4c3f4da8e09de188f2f63e8f741bba3bd Mon Sep 17 00:00:00 2001
+From: Guoyu Ou <benogy@gmail.com>
+Date: Sun, 28 Jan 2024 16:46:17 +0800
+Subject: bcachefs: unlock parent dir if entry is not found in subvolume deletion
+
+From: Guoyu Ou <benogy@gmail.com>
+
+commit 6bb3f7f4c3f4da8e09de188f2f63e8f741bba3bd upstream.
+
+Parent dir is locked by user_path_locked_at() before validating the
+required dentry. It should be unlocked if we can not perform the
+deletion.
+
+This fixes the problem:
+
+$ bcachefs subvolume delete not-exist-entry
+BCH_IOCTL_SUBVOLUME_DESTROY ioctl error: No such file or directory
+$ bcachefs subvolume delete not-exist-entry
+
+the second invocation gets stuck because the parent dir was left locked by
+the previous deletion attempt.
+
+Signed-off-by: Guoyu Ou <benogy@gmail.com>
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-ioctl.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/bcachefs/fs-ioctl.c
++++ b/fs/bcachefs/fs-ioctl.c
+@@ -463,6 +463,7 @@ static long bch2_ioctl_subvolume_destroy
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
+
++ dir = d_inode(path.dentry);
+ if (victim->d_sb->s_fs_info != c) {
+ ret = -EXDEV;
+ goto err;
+@@ -471,14 +472,13 @@ static long bch2_ioctl_subvolume_destroy
+ ret = -ENOENT;
+ goto err;
+ }
+- dir = d_inode(path.dentry);
+ ret = __bch2_unlink(dir, victim, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, victim);
+ d_delete(victim);
+ }
+- inode_unlock(dir);
+ err:
++ inode_unlock(dir);
+ dput(victim);
+ path_put(&path);
+ return ret;
bcachefs-don-t-pass-memcmp-as-a-pointer.patch
bcachefs-rebalance-should-wakeup-on-shutdown-if-disabled.patch
bcachefs-add-missing-bch2_moving_ctxt_flush_all.patch
+bcachefs-bch2_kthread_io_clock_wait-no-longer-sleeps-until-full-amount.patch
+bcachefs-kvfree-bch_fs-snapshots-in-bch2_fs_snapshots_exit.patch
+bcachefs-grab-s_umount-only-if-snapshotting.patch
+bcachefs-fix-incorrect-usage-of-req_op_flush.patch
+bcachefs-unlock-parent-dir-if-entry-is-not-found-in-subvolume-deletion.patch
+bcachefs-time_stats-check-for-last_event-0-when-updating-freq-stats.patch