From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 13 Sep 2024 12:47:46 +0000 (+0200)
Subject: 6.10-stable patches
X-Git-Tag: v6.1.111~39
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2369c1c7cf68a572a8181392b3c75fa62f3029bc;p=thirdparty%2Fkernel%2Fstable-queue.git

6.10-stable patches

added patches:
	bcachefs-don-t-delete-open-files-in-online-fsck.patch
	bcachefs-fix-bch2_extents_match-false-positive.patch
	bcachefs-revert-lockless-buffered-io-path.patch
---

diff --git a/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch b/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch
new file mode 100644
index 00000000000..6e0a0a51f92
--- /dev/null
+++ b/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch
@@ -0,0 +1,103 @@
+From ee64e00984ec3ea3fbf9b4331cbd073690ff8765 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sun, 8 Sep 2024 01:06:57 -0400
+Subject: bcachefs: Don't delete open files in online fsck
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit 16005147cca41a0f67b5def2a4656286f8c0db4a ]
+
+If a file is unlinked but still open, we don't want online fsck to
+delete it - or fun inconsistencies will happen.
+
+https://github.com/koverstreet/bcachefs/issues/727
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs.c   |    8 ++++++++
+ fs/bcachefs/fs.h   |    7 +++++++
+ fs/bcachefs/fsck.c |   18 ++++++++++++++++++
+ 3 files changed, 33 insertions(+)
+
+--- a/fs/bcachefs/fs.c
++++ b/fs/bcachefs/fs.c
+@@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_i
+ 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ }
+ 
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++	return to_bch_ei(ilookup5_nowait(c->vfs_sb,
++					 bch2_inode_hash(inum),
++					 bch2_iget5_test,
++					 &inum));
++}
++
+ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+ {
+ 	subvol_inum inum = inode_inum(inode);
+--- a/fs/bcachefs/fs.h
++++ b/fs/bcachefs/fs.h
+@@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(str
+ 	};
+ }
+ 
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
++
+ /*
+  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+  * btree inode may be inconsistent:
+@@ -194,6 +196,11 @@ int bch2_vfs_init(void);
+ 
+ #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
+ 
++static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++	return NULL;
++}
++
+ static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ 					       snapshot_id_list *s) {}
+ static inline void bch2_vfs_exit(void) {}
+--- a/fs/bcachefs/fsck.c
++++ b/fs/bcachefs/fsck.c
+@@ -8,6 +8,7 @@
+ #include "darray.h"
+ #include "dirent.h"
+ #include "error.h"
++#include "fs.h"
+ #include "fs-common.h"
+ #include "fsck.h"
+ #include "inode.h"
+@@ -948,6 +949,22 @@ fsck_err:
+ 	return ret;
+ }
+ 
++static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
++{
++	subvol_inum inum = {
++		.subvol = snapshot_t(c, p.snapshot)->subvol,
++		.inum	= p.offset,
++	};
++
++	/* snapshot tree corruption, can't safely delete */
++	if (!inum.subvol) {
++		bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
++		return true;
++	}
++
++	return __bch2_inode_hash_find(c, inum) != NULL;
++}
++
+ static int check_inode(struct btree_trans *trans,
+ 		       struct btree_iter *iter,
+ 		       struct bkey_s_c k,
+@@ -1025,6 +1042,7 @@ static int check_inode(struct btree_tran
+ 	}
+ 
+ 	if (u.bi_flags & BCH_INODE_unlinked &&
++	    !bch2_inode_open(c, k.k->p) &&
+ 	    (!c->sb.clean ||
+ 	     fsck_err(c, inode_unlinked_but_clean,
+ 		      "filesystem marked clean, but inode %llu unlinked",
diff --git a/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch b/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch
new file mode 100644
index 00000000000..cb974cfc85a
--- /dev/null
+++ b/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch
@@ -0,0 +1,85 @@
+From d02c97848ab5b81489f59bfb64771a30134a7843 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Mon, 26 Aug 2024 19:11:00 -0400
+Subject: bcachefs: Fix bch2_extents_match() false positive
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit  d26935690c03fe8159d42358bed1c56252700cd1 ]
+
+This was caught as a very rare nonce inconsistency, on systems with
+encryption and replication (and tiering, or some form of rebalance
+operation running):
+
+[Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path
+[Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] k:   u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0
+[Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only
+
+bch2_extents_match() was reporting true for extents that did not
+actually point to the same data.
+
+bch2_extents_match() iterates over pairs of pointers, looking for
+pointers that point to the same location on disk (with matching
+generation numbers). However one or both extents may have been trimmed
+(or merged) and they might not have the same disk offset: it corrects
+for this by subtracting the key offset and the checksum entry offset.
+
+However, this failed when an extent was immediately partially
+overwritten, and the new overwrite was allocated the next adjacent disk
+space.
+
+Normally, with compression off, this would never cause a bug, since the
+new extent would have to be immediately after the old extent for the
+pointer offsets to match, and the rebalance index update path is not
+looking for an extent outside the range of the extent it moved.
+
+However with compression enabled, extents take up less space on disk
+than they do in the btree index space - and spuriously matching after
+partial overwrite is possible.
+
+To fix this, add a secondary check, that strictly checks that the
+regions pointed to on disk overlap.
+
+https://github.com/koverstreet/bcachefs/issues/717
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/extents.c |   23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+--- a/fs/bcachefs/extents.c
++++ b/fs/bcachefs/extents.c
+@@ -932,8 +932,29 @@ bool bch2_extents_match(struct bkey_s_c
+ 			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ 				if (p1.ptr.dev		== p2.ptr.dev &&
+ 				    p1.ptr.gen		== p2.ptr.gen &&
++
++				    /*
++				     * This checks that the two pointers point
++				     * to the same region on disk - adjusting
++				     * for the difference in where the extents
++				     * start, since one may have been trimmed:
++				     */
+ 				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+-				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
++				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
++
++				    /*
++				     * This additionally checks that the
++				     * extents overlap on disk, since the
++				     * previous check may trigger spuriously
++				     * when one extent is immediately partially
++				     * overwritten with another extent (so that
++				     * on disk they are adjacent) and
++				     * compression is in use:
++				     */
++				    ((p1.ptr.offset >= p2.ptr.offset &&
++				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
++				     (p2.ptr.offset >= p1.ptr.offset &&
++				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
+ 					return true;
+ 
+ 		return false;
diff --git a/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch b/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch
new file mode 100644
index 00000000000..21d548c049e
--- /dev/null
+++ b/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch
@@ -0,0 +1,258 @@
+From 6700f5b9af27d3feea1162fa43c4861e98f8f021 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sat, 31 Aug 2024 17:44:51 -0400
+Subject: bcachefs: Revert lockless buffered IO path
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit  e3e6940940910c2287fe962bdf72015efd4fee81 ]
+
+We had a report of data corruption on nixos when building installer
+images.
+
+https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334
+
+It seems that writes are being dropped, but only when issued by QEMU,
+and possibly only in snapshot mode. It's undetermined whether it's
+write calls or dirty folios that are being dropped.
+
+Further testing, via minimizing the original patch to just the change
+that skips the inode lock on non appends/truncates, reveals that it
+really is just not taking the inode lock that causes the corruption: it
+has nothing to do with the other logic changes for preserving write
+atomicity in corner cases.
+
+It's also kernel config dependent: it doesn't reproduce with the minimal
+kernel config that ktest uses, but it does reproduce with nixos's distro
+config. Bisecting the kernel config initially pointed the finger at page
+migration or compaction, but it appears that was erroneous; we haven't
+yet determined what kernel config option actually triggers it.
+
+Sadly it appears this will have to be reverted since we're getting too
+close to release and my plate is full, but we'd _really_ like to fully
+debug it.
+
+My suspicion is that this patch is exposing a preexisting bug - the
+inode lock actually covers very little in IO paths, and we have a
+different lock (the pagecache add lock) that guards against races with
+truncate here.
+
+Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock")
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-io-buffered.c |  149 +++++++++++--------------------------------
+ 1 file changed, 40 insertions(+), 109 deletions(-)
+
+--- a/fs/bcachefs/fs-io-buffered.c
++++ b/fs/bcachefs/fs-io-buffered.c
+@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios
+ static int __bch2_buffered_write(struct bch_inode_info *inode,
+ 				 struct address_space *mapping,
+ 				 struct iov_iter *iter,
+-				 loff_t pos, unsigned len,
+-				 bool inode_locked)
++				 loff_t pos, unsigned len)
+ {
+ 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ 	struct bch2_folio_reservation res;
+@@ -828,15 +827,6 @@ static int __bch2_buffered_write(struct
+ 
+ 	BUG_ON(!fs.nr);
+ 
+-	/*
+-	 * If we're not using the inode lock, we need to lock all the folios for
+-	 * atomiticity of writes vs. other writes:
+-	 */
+-	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+-		ret = -BCH_ERR_need_inode_lock;
+-		goto out;
+-	}
+-
+ 	f = darray_first(fs);
+ 	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+ 		ret = bch2_read_single_folio(f, mapping);
+@@ -931,10 +921,8 @@ static int __bch2_buffered_write(struct
+ 	end = pos + copied;
+ 
+ 	spin_lock(&inode->v.i_lock);
+-	if (end > inode->v.i_size) {
+-		BUG_ON(!inode_locked);
++	if (end > inode->v.i_size)
+ 		i_size_write(&inode->v, end);
+-	}
+ 	spin_unlock(&inode->v.i_lock);
+ 
+ 	f_pos = pos;
+@@ -978,68 +966,12 @@ static ssize_t bch2_buffered_write(struc
+ 	struct file *file = iocb->ki_filp;
+ 	struct address_space *mapping = file->f_mapping;
+ 	struct bch_inode_info *inode = file_bch_inode(file);
+-	loff_t pos;
+-	bool inode_locked = false;
+-	ssize_t written = 0, written2 = 0, ret = 0;
+-
+-	/*
+-	 * We don't take the inode lock unless i_size will be changing. Folio
+-	 * locks provide exclusion with other writes, and the pagecache add lock
+-	 * provides exclusion with truncate and hole punching.
+-	 *
+-	 * There is one nasty corner case where atomicity would be broken
+-	 * without great care: when copying data from userspace to the page
+-	 * cache, we do that with faults disable - a page fault would recurse
+-	 * back into the filesystem, taking filesystem locks again, and
+-	 * deadlock; so it's done with faults disabled, and we fault in the user
+-	 * buffer when we aren't holding locks.
+-	 *
+-	 * If we do part of the write, but we then race and in the userspace
+-	 * buffer have been evicted and are no longer resident, then we have to
+-	 * drop our folio locks to re-fault them in, breaking write atomicity.
+-	 *
+-	 * To fix this, we restart the write from the start, if we weren't
+-	 * holding the inode lock.
+-	 *
+-	 * There is another wrinkle after that; if we restart the write from the
+-	 * start, and then get an unrecoverable error, we _cannot_ claim to
+-	 * userspace that we did not write data we actually did - so we must
+-	 * track (written2) the most we ever wrote.
+-	 */
+-
+-	if ((iocb->ki_flags & IOCB_APPEND) ||
+-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+-		inode_lock(&inode->v);
+-		inode_locked = true;
+-	}
+-
+-	ret = generic_write_checks(iocb, iter);
+-	if (ret <= 0)
+-		goto unlock;
+-
+-	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+-	if (ret) {
+-		if (!inode_locked) {
+-			inode_lock(&inode->v);
+-			inode_locked = true;
+-			ret = file_remove_privs_flags(file, 0);
+-		}
+-		if (ret)
+-			goto unlock;
+-	}
+-
+-	ret = file_update_time(file);
+-	if (ret)
+-		goto unlock;
+-
+-	pos = iocb->ki_pos;
++	loff_t pos = iocb->ki_pos;
++	ssize_t written = 0;
++	int ret = 0;
+ 
+ 	bch2_pagecache_add_get(inode);
+ 
+-	if (!inode_locked &&
+-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+-		goto get_inode_lock;
+-
+ 	do {
+ 		unsigned offset = pos & (PAGE_SIZE - 1);
+ 		unsigned bytes = iov_iter_count(iter);
+@@ -1064,17 +996,12 @@ again:
+ 			}
+ 		}
+ 
+-		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+-			goto get_inode_lock;
+-
+ 		if (unlikely(fatal_signal_pending(current))) {
+ 			ret = -EINTR;
+ 			break;
+ 		}
+ 
+-		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+-		if (ret == -BCH_ERR_need_inode_lock)
+-			goto get_inode_lock;
++		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ 		if (unlikely(ret < 0))
+ 			break;
+ 
+@@ -1095,46 +1022,50 @@ again:
+ 		}
+ 		pos += ret;
+ 		written += ret;
+-		written2 = max(written, written2);
+-
+-		if (ret != bytes && !inode_locked)
+-			goto get_inode_lock;
+ 		ret = 0;
+ 
+ 		balance_dirty_pages_ratelimited(mapping);
+-
+-		if (0) {
+-get_inode_lock:
+-			bch2_pagecache_add_put(inode);
+-			inode_lock(&inode->v);
+-			inode_locked = true;
+-			bch2_pagecache_add_get(inode);
+-
+-			iov_iter_revert(iter, written);
+-			pos -= written;
+-			written = 0;
+-			ret = 0;
+-		}
+ 	} while (iov_iter_count(iter));
+-	bch2_pagecache_add_put(inode);
+-unlock:
+-	if (inode_locked)
+-		inode_unlock(&inode->v);
+ 
+-	iocb->ki_pos += written;
++	bch2_pagecache_add_put(inode);
+ 
+-	ret = max(written, written2) ?: ret;
+-	if (ret > 0)
+-		ret = generic_write_sync(iocb, ret);
+-	return ret;
++	return written ? written : ret;
+ }
+ 
+-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+-	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+-		? bch2_direct_write(iocb, iter)
+-		: bch2_buffered_write(iocb, iter);
++	struct file *file = iocb->ki_filp;
++	struct bch_inode_info *inode = file_bch_inode(file);
++	ssize_t ret;
++
++	if (iocb->ki_flags & IOCB_DIRECT) {
++		ret = bch2_direct_write(iocb, from);
++		goto out;
++	}
+ 
++	inode_lock(&inode->v);
++
++	ret = generic_write_checks(iocb, from);
++	if (ret <= 0)
++		goto unlock;
++
++	ret = file_remove_privs(file);
++	if (ret)
++		goto unlock;
++
++	ret = file_update_time(file);
++	if (ret)
++		goto unlock;
++
++	ret = bch2_buffered_write(iocb, from);
++	if (likely(ret > 0))
++		iocb->ki_pos += ret;
++unlock:
++	inode_unlock(&inode->v);
++
++	if (ret > 0)
++		ret = generic_write_sync(iocb, ret);
++out:
+ 	return bch2_err_class(ret);
+ }
+ 
diff --git a/queue-6.10/series b/queue-6.10/series
index 7ace2e7237e..653e3066781 100644
--- a/queue-6.10/series
+++ b/queue-6.10/series
@@ -53,3 +53,6 @@ dm-integrity-fix-a-race-condition-when-accessing-recalc_sector.patch
 clocksource-hyper-v-use-lapic-timer-in-a-tdx-vm-without-paravisor.patch
 x86-hyperv-fix-kexec-crash-due-to-vp-assist-page-corruption.patch
 mm-avoid-leaving-partial-pfn-mappings-around-in-error-case.patch
+bcachefs-fix-bch2_extents_match-false-positive.patch
+bcachefs-revert-lockless-buffered-io-path.patch
+bcachefs-don-t-delete-open-files-in-online-fsck.patch