--- /dev/null
+From ee64e00984ec3ea3fbf9b4331cbd073690ff8765 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sun, 8 Sep 2024 01:06:57 -0400
+Subject: bcachefs: Don't delete open files in online fsck
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit 16005147cca41a0f67b5def2a4656286f8c0db4a ]
+
+If a file is unlinked but still open, we don't want online fsck to
+delete it - or fun inconsistencies will happen.
+
+https://github.com/koverstreet/bcachefs/issues/727
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs.c | 8 ++++++++
+ fs/bcachefs/fs.h | 7 +++++++
+ fs/bcachefs/fsck.c | 18 ++++++++++++++++++
+ 3 files changed, 33 insertions(+)
+
+--- a/fs/bcachefs/fs.c
++++ b/fs/bcachefs/fs.c
+@@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_i
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ }
+
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++ return to_bch_ei(ilookup5_nowait(c->vfs_sb,
++ bch2_inode_hash(inum),
++ bch2_iget5_test,
++ &inum));
++}
++
+ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+ {
+ subvol_inum inum = inode_inum(inode);
+--- a/fs/bcachefs/fs.h
++++ b/fs/bcachefs/fs.h
+@@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(str
+ };
+ }
+
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
++
+ /*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+@@ -194,6 +196,11 @@ int bch2_vfs_init(void);
+
+ #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+
++static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++ return NULL;
++}
++
+ static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ snapshot_id_list *s) {}
+ static inline void bch2_vfs_exit(void) {}
+--- a/fs/bcachefs/fsck.c
++++ b/fs/bcachefs/fsck.c
+@@ -8,6 +8,7 @@
+ #include "darray.h"
+ #include "dirent.h"
+ #include "error.h"
++#include "fs.h"
+ #include "fs-common.h"
+ #include "fsck.h"
+ #include "inode.h"
+@@ -948,6 +949,22 @@ fsck_err:
+ return ret;
+ }
+
++static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
++{
++ subvol_inum inum = {
++ .subvol = snapshot_t(c, p.snapshot)->subvol,
++ .inum = p.offset,
++ };
++
++ /* snapshot tree corruption, can't safely delete */
++ if (!inum.subvol) {
++ bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
++ return true;
++ }
++
++ return __bch2_inode_hash_find(c, inum) != NULL;
++}
++
+ static int check_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+@@ -1025,6 +1042,7 @@ static int check_inode(struct btree_tran
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked &&
++ !bch2_inode_open(c, k.k->p) &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_unlinked_but_clean,
+ "filesystem marked clean, but inode %llu unlinked",
--- /dev/null
+From d02c97848ab5b81489f59bfb64771a30134a7843 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Mon, 26 Aug 2024 19:11:00 -0400
+Subject: bcachefs: Fix bch2_extents_match() false positive
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit d26935690c03fe8159d42358bed1c56252700cd1 ]
+
+This was caught as a very rare nonce inconsistency, on systems with
+encryption and replication (and tiering, or some form of rebalance
+operation running):
+
+[Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path
+[Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] k: u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0
+[Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only
+
+bch2_extents_match() was reporting true for extents that did not
+actually point to the same data.
+
+bch2_extents_match() iterates over pairs of pointers, looking for
+pointers that point to the same location on disk (with matching
+generation numbers). However one or both extents may have been trimmed
+(or merged) and they might not have the same disk offset: it corrects
+for this by subtracting the key offset and the checksum entry offset.
+
+However, this failed when an extent was immediately partially
+overwritten, and the new overwrite was allocated the next adjacent disk
+space.
+
+Normally, with compression off, this would never cause a bug, since the
+new extent would have to be immediately after the old extent for the
+pointer offsets to match, and the rebalance index update path is not
+looking for an extent outside the range of the extent it moved.
+
+However with compression enabled, extents take up less space on disk
+than they do in the btree index space - and spuriously matching after
+partial overwrite is possible.
+
+To fix this, add a secondary check, that strictly checks that the
+regions pointed to on disk overlap.
+
+https://github.com/koverstreet/bcachefs/issues/717
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/extents.c | 23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+--- a/fs/bcachefs/extents.c
++++ b/fs/bcachefs/extents.c
+@@ -932,8 +932,29 @@ bool bch2_extents_match(struct bkey_s_c
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
++
++ /*
++ * This checks that the two pointers point
++ * to the same region on disk - adjusting
++ * for the difference in where the extents
++ * start, since one may have been trimmed:
++ */
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
++
++ /*
++ * This additionally checks that the
++ * extents overlap on disk, since the
++ * previous check may trigger spuriously
++ * when one extent is immediately partially
++ * overwritten with another extent (so that
++ * on disk they are adjacent) and
++ * compression is in use:
++ */
++ ((p1.ptr.offset >= p2.ptr.offset &&
++ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
++ (p2.ptr.offset >= p1.ptr.offset &&
++ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
+ return true;
+
+ return false;
--- /dev/null
+From 6700f5b9af27d3feea1162fa43c4861e98f8f021 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sat, 31 Aug 2024 17:44:51 -0400
+Subject: bcachefs: Revert lockless buffered IO path
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit e3e6940940910c2287fe962bdf72015efd4fee81 ]
+
+We had a report of data corruption on nixos when building installer
+images.
+
+https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334
+
+It seems that writes are being dropped, but only when issued by QEMU,
+and possibly only in snapshot mode. It's undetermined whether it's write
+calls or dirty folios that are being dropped.
+
+Further testing, via minimizing the original patch to just the change
+that skips the inode lock on non appends/truncates, reveals that it
+really is just not taking the inode lock that causes the corruption: it
+has nothing to do with the other logic changes for preserving write
+atomicity in corner cases.
+
+It's also kernel config dependent: it doesn't reproduce with the minimal
+kernel config that ktest uses, but it does reproduce with nixos's distro
+config. Bisecting the kernel config initially pointed the finger at page
+migration or compaction, but it appears that was erroneous; we haven't
+yet determined what kernel config option actually triggers it.
+
+Sadly it appears this will have to be reverted since we're getting too
+close to release and my plate is full, but we'd _really_ like to fully
+debug it.
+
+My suspicion is that this patch is exposing a preexisting bug - the
+inode lock actually covers very little in IO paths, and we have a
+different lock (the pagecache add lock) that guards against races with
+truncate here.
+
+Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock")
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-io-buffered.c | 149 +++++++++++--------------------------------
+ 1 file changed, 40 insertions(+), 109 deletions(-)
+
+--- a/fs/bcachefs/fs-io-buffered.c
++++ b/fs/bcachefs/fs-io-buffered.c
+@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios
+ static int __bch2_buffered_write(struct bch_inode_info *inode,
+ struct address_space *mapping,
+ struct iov_iter *iter,
+- loff_t pos, unsigned len,
+- bool inode_locked)
++ loff_t pos, unsigned len)
+ {
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+@@ -828,15 +827,6 @@ static int __bch2_buffered_write(struct
+
+ BUG_ON(!fs.nr);
+
+- /*
+- * If we're not using the inode lock, we need to lock all the folios for
+- * atomiticity of writes vs. other writes:
+- */
+- if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+- ret = -BCH_ERR_need_inode_lock;
+- goto out;
+- }
+-
+ f = darray_first(fs);
+ if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+@@ -931,10 +921,8 @@ static int __bch2_buffered_write(struct
+ end = pos + copied;
+
+ spin_lock(&inode->v.i_lock);
+- if (end > inode->v.i_size) {
+- BUG_ON(!inode_locked);
++ if (end > inode->v.i_size)
+ i_size_write(&inode->v, end);
+- }
+ spin_unlock(&inode->v.i_lock);
+
+ f_pos = pos;
+@@ -978,68 +966,12 @@ static ssize_t bch2_buffered_write(struc
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+- loff_t pos;
+- bool inode_locked = false;
+- ssize_t written = 0, written2 = 0, ret = 0;
+-
+- /*
+- * We don't take the inode lock unless i_size will be changing. Folio
+- * locks provide exclusion with other writes, and the pagecache add lock
+- * provides exclusion with truncate and hole punching.
+- *
+- * There is one nasty corner case where atomicity would be broken
+- * without great care: when copying data from userspace to the page
+- * cache, we do that with faults disable - a page fault would recurse
+- * back into the filesystem, taking filesystem locks again, and
+- * deadlock; so it's done with faults disabled, and we fault in the user
+- * buffer when we aren't holding locks.
+- *
+- * If we do part of the write, but we then race and in the userspace
+- * buffer have been evicted and are no longer resident, then we have to
+- * drop our folio locks to re-fault them in, breaking write atomicity.
+- *
+- * To fix this, we restart the write from the start, if we weren't
+- * holding the inode lock.
+- *
+- * There is another wrinkle after that; if we restart the write from the
+- * start, and then get an unrecoverable error, we _cannot_ claim to
+- * userspace that we did not write data we actually did - so we must
+- * track (written2) the most we ever wrote.
+- */
+-
+- if ((iocb->ki_flags & IOCB_APPEND) ||
+- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+- inode_lock(&inode->v);
+- inode_locked = true;
+- }
+-
+- ret = generic_write_checks(iocb, iter);
+- if (ret <= 0)
+- goto unlock;
+-
+- ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+- if (ret) {
+- if (!inode_locked) {
+- inode_lock(&inode->v);
+- inode_locked = true;
+- ret = file_remove_privs_flags(file, 0);
+- }
+- if (ret)
+- goto unlock;
+- }
+-
+- ret = file_update_time(file);
+- if (ret)
+- goto unlock;
+-
+- pos = iocb->ki_pos;
++ loff_t pos = iocb->ki_pos;
++ ssize_t written = 0;
++ int ret = 0;
+
+ bch2_pagecache_add_get(inode);
+
+- if (!inode_locked &&
+- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+- goto get_inode_lock;
+-
+ do {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned bytes = iov_iter_count(iter);
+@@ -1064,17 +996,12 @@ again:
+ }
+ }
+
+- if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+- goto get_inode_lock;
+-
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -EINTR;
+ break;
+ }
+
+- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+- if (ret == -BCH_ERR_need_inode_lock)
+- goto get_inode_lock;
++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ if (unlikely(ret < 0))
+ break;
+
+@@ -1095,46 +1022,50 @@ again:
+ }
+ pos += ret;
+ written += ret;
+- written2 = max(written, written2);
+-
+- if (ret != bytes && !inode_locked)
+- goto get_inode_lock;
+ ret = 0;
+
+ balance_dirty_pages_ratelimited(mapping);
+-
+- if (0) {
+-get_inode_lock:
+- bch2_pagecache_add_put(inode);
+- inode_lock(&inode->v);
+- inode_locked = true;
+- bch2_pagecache_add_get(inode);
+-
+- iov_iter_revert(iter, written);
+- pos -= written;
+- written = 0;
+- ret = 0;
+- }
+ } while (iov_iter_count(iter));
+- bch2_pagecache_add_put(inode);
+-unlock:
+- if (inode_locked)
+- inode_unlock(&inode->v);
+
+- iocb->ki_pos += written;
++ bch2_pagecache_add_put(inode);
+
+- ret = max(written, written2) ?: ret;
+- if (ret > 0)
+- ret = generic_write_sync(iocb, ret);
+- return ret;
++ return written ? written : ret;
+ }
+
+-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+- ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+- ? bch2_direct_write(iocb, iter)
+- : bch2_buffered_write(iocb, iter);
++ struct file *file = iocb->ki_filp;
++ struct bch_inode_info *inode = file_bch_inode(file);
++ ssize_t ret;
++
++ if (iocb->ki_flags & IOCB_DIRECT) {
++ ret = bch2_direct_write(iocb, from);
++ goto out;
++ }
+
++ inode_lock(&inode->v);
++
++ ret = generic_write_checks(iocb, from);
++ if (ret <= 0)
++ goto unlock;
++
++ ret = file_remove_privs(file);
++ if (ret)
++ goto unlock;
++
++ ret = file_update_time(file);
++ if (ret)
++ goto unlock;
++
++ ret = bch2_buffered_write(iocb, from);
++ if (likely(ret > 0))
++ iocb->ki_pos += ret;
++unlock:
++ inode_unlock(&inode->v);
++
++ if (ret > 0)
++ ret = generic_write_sync(iocb, ret);
++out:
+ return bch2_err_class(ret);
+ }
+