From: Greg Kroah-Hartman Date: Fri, 13 Sep 2024 12:47:46 +0000 (+0200) Subject: 6.10-stable patches X-Git-Tag: v6.1.111~39 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2369c1c7cf68a572a8181392b3c75fa62f3029bc;p=thirdparty%2Fkernel%2Fstable-queue.git 6.10-stable patches added patches: bcachefs-don-t-delete-open-files-in-online-fsck.patch bcachefs-fix-bch2_extents_match-false-positive.patch bcachefs-revert-lockless-buffered-io-path.patch --- diff --git a/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch b/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch new file mode 100644 index 00000000000..6e0a0a51f92 --- /dev/null +++ b/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch @@ -0,0 +1,103 @@ +From ee64e00984ec3ea3fbf9b4331cbd073690ff8765 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sun, 8 Sep 2024 01:06:57 -0400 +Subject: bcachefs: Don't delete open files in online fsck + +From: Kent Overstreet + +[ Upstream commit 16005147cca41a0f67b5def2a4656286f8c0db4a ] + +If a file is unlinked but still open, we don't want online fsck to +delete it - or fun inconsistencies will happen. 
+ +https://github.com/koverstreet/bcachefs/issues/727 + +Signed-off-by: Kent Overstreet +Signed-off-by: Greg Kroah-Hartman +--- + fs/bcachefs/fs.c | 8 ++++++++ + fs/bcachefs/fs.h | 7 +++++++ + fs/bcachefs/fsck.c | 18 ++++++++++++++++++ + 3 files changed, 33 insertions(+) + +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_i + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + } + ++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) ++{ ++ return to_bch_ei(ilookup5_nowait(c->vfs_sb, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ &inum)); ++} ++ + static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) + { + subvol_inum inum = inode_inum(inode); +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(str + }; + } + ++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); ++ + /* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: +@@ -194,6 +196,11 @@ int bch2_vfs_init(void); + + #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) + ++static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) ++{ ++ return NULL; ++} ++ + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + snapshot_id_list *s) {} + static inline void bch2_vfs_exit(void) {} +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -8,6 +8,7 @@ + #include "darray.h" + #include "dirent.h" + #include "error.h" ++#include "fs.h" + #include "fs-common.h" + #include "fsck.h" + #include "inode.h" +@@ -948,6 +949,22 @@ fsck_err: + return ret; + } + ++static bool bch2_inode_open(struct bch_fs *c, struct bpos p) ++{ ++ subvol_inum inum = { ++ .subvol = snapshot_t(c, p.snapshot)->subvol, ++ .inum = p.offset, ++ }; ++ ++ /* 
snapshot tree corruption, can't safely delete */ ++ if (!inum.subvol) { ++ bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); ++ return true; ++ } ++ ++ return __bch2_inode_hash_find(c, inum) != NULL; ++} ++ + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, +@@ -1025,6 +1042,7 @@ static int check_inode(struct btree_tran + } + + if (u.bi_flags & BCH_INODE_unlinked && ++ !bch2_inode_open(c, k.k->p) && + (!c->sb.clean || + fsck_err(c, inode_unlinked_but_clean, + "filesystem marked clean, but inode %llu unlinked", diff --git a/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch b/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch new file mode 100644 index 00000000000..cb974cfc85a --- /dev/null +++ b/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch @@ -0,0 +1,85 @@ +From d02c97848ab5b81489f59bfb64771a30134a7843 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Mon, 26 Aug 2024 19:11:00 -0400 +Subject: bcachefs: Fix bch2_extents_match() false positive + +From: Kent Overstreet + +[ Upstream commit d26935690c03fe8159d42358bed1c56252700cd1 ] + +This was caught as a very rare nonce inconsistency, on systems with +encryption and replication (and tiering, or some form of rebalance +operation running): + +[Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path +[Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd +[Wed Jul 17 13:30:03 2024] k: u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd +[Wed Jul 17 13:30:03 2024] new: 
u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0 +[Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only + +bch2_extents_match() was reporting true for extents that did not +actually point to the same data. + +bch2_extents_match() iterates over pairs of pointers, looking for +pointers that point to the same location on disk (with matching +generation numbers). However one or both extents may have been trimmed +(or merged) and they might not have the same disk offset: it corrects +for this by subtracting the key offset and the checksum entry offset. + +However, this failed when an extent was immediately partially +overwritten, and the new overwrite was allocated the next adjacent disk +space. + +Normally, with compression off, this would never cause a bug, since the +new extent would have to be immediately after the old extent for the +pointer offsets to match, and the rebalance index update path is not +looking for an extent outside the range of the extent it moved. + +However with compression enabled, extents take up less space on disk +than they do in the btree index space - and spuriously matching after +partial overwrite is possible. + +To fix this, add a secondary check, that strictly checks that the +regions pointed to on disk overlap. 
+ +https://github.com/koverstreet/bcachefs/issues/717 + +Signed-off-by: Kent Overstreet +Signed-off-by: Greg Kroah-Hartman +--- + fs/bcachefs/extents.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -932,8 +932,29 @@ bool bch2_extents_match(struct bkey_s_c + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && ++ ++ /* ++ * This checks that the two pointers point ++ * to the same region on disk - adjusting ++ * for the difference in where the extents ++ * start, since one may have been trimmed: ++ */ + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == +- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && ++ ++ /* ++ * This additionally checks that the ++ * extents overlap on disk, since the ++ * previous check may trigger spuriously ++ * when one extent is immediately partially ++ * overwritten with another extent (so that ++ * on disk they are adjacent) and ++ * compression is in use: ++ */ ++ ((p1.ptr.offset >= p2.ptr.offset && ++ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || ++ (p2.ptr.offset >= p1.ptr.offset && ++ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) + return true; + + return false; diff --git a/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch b/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch new file mode 100644 index 00000000000..21d548c049e --- /dev/null +++ b/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch @@ -0,0 +1,258 @@ +From 6700f5b9af27d3feea1162fa43c4861e98f8f021 Mon Sep 17 00:00:00 2001 +From: Kent Overstreet +Date: Sat, 31 Aug 2024 17:44:51 -0400 +Subject: bcachefs: Revert lockless buffered IO path + +From: Kent Overstreet + +[ Upstream commit e3e6940940910c2287fe962bdf72015efd4fee81 ] + +We had a report of data corruption on nixos when 
building installer +images. + +https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334 + +It seems that writes are being dropped, but only when issued by QEMU, +and possibly only in snapshot mode. It's undetermined if it's write +calls that are being dropped or dirty folios. + +Further testing, via minimizing the original patch to just the change +that skips the inode lock on non appends/truncates, reveals that it +really is just not taking the inode lock that causes the corruption: it +has nothing to do with the other logic changes for preserving write +atomicity in corner cases. + +It's also kernel config dependent: it doesn't reproduce with the minimal +kernel config that ktest uses, but it does reproduce with nixos's distro +config. Bisecting the kernel config initially pointed the finger at page +migration or compaction, but it appears that was erroneous; we haven't +yet determined what kernel config option actually triggers it. + +Sadly it appears this will have to be reverted since we're getting too +close to release and my plate is full, but we'd _really_ like to fully +debug it. + +My suspicion is that this patch is exposing a preexisting bug - the +inode lock actually covers very little in IO paths, and we have a +different lock (the pagecache add lock) that guards against races with +truncate here. 
+ +Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock") +Signed-off-by: Kent Overstreet +Signed-off-by: Greg Kroah-Hartman +--- + fs/bcachefs/fs-io-buffered.c | 149 +++++++++++-------------------------------- + 1 file changed, 40 insertions(+), 109 deletions(-) + +--- a/fs/bcachefs/fs-io-buffered.c ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios + static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, +- loff_t pos, unsigned len, +- bool inode_locked) ++ loff_t pos, unsigned len) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; +@@ -828,15 +827,6 @@ static int __bch2_buffered_write(struct + + BUG_ON(!fs.nr); + +- /* +- * If we're not using the inode lock, we need to lock all the folios for +- * atomiticity of writes vs. other writes: +- */ +- if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { +- ret = -BCH_ERR_need_inode_lock; +- goto out; +- } +- + f = darray_first(fs); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); +@@ -931,10 +921,8 @@ static int __bch2_buffered_write(struct + end = pos + copied; + + spin_lock(&inode->v.i_lock); +- if (end > inode->v.i_size) { +- BUG_ON(!inode_locked); ++ if (end > inode->v.i_size) + i_size_write(&inode->v, end); +- } + spin_unlock(&inode->v.i_lock); + + f_pos = pos; +@@ -978,68 +966,12 @@ static ssize_t bch2_buffered_write(struc + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); +- loff_t pos; +- bool inode_locked = false; +- ssize_t written = 0, written2 = 0, ret = 0; +- +- /* +- * We don't take the inode lock unless i_size will be changing. Folio +- * locks provide exclusion with other writes, and the pagecache add lock +- * provides exclusion with truncate and hole punching. 
+- * +- * There is one nasty corner case where atomicity would be broken +- * without great care: when copying data from userspace to the page +- * cache, we do that with faults disable - a page fault would recurse +- * back into the filesystem, taking filesystem locks again, and +- * deadlock; so it's done with faults disabled, and we fault in the user +- * buffer when we aren't holding locks. +- * +- * If we do part of the write, but we then race and in the userspace +- * buffer have been evicted and are no longer resident, then we have to +- * drop our folio locks to re-fault them in, breaking write atomicity. +- * +- * To fix this, we restart the write from the start, if we weren't +- * holding the inode lock. +- * +- * There is another wrinkle after that; if we restart the write from the +- * start, and then get an unrecoverable error, we _cannot_ claim to +- * userspace that we did not write data we actually did - so we must +- * track (written2) the most we ever wrote. +- */ +- +- if ((iocb->ki_flags & IOCB_APPEND) || +- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { +- inode_lock(&inode->v); +- inode_locked = true; +- } +- +- ret = generic_write_checks(iocb, iter); +- if (ret <= 0) +- goto unlock; +- +- ret = file_remove_privs_flags(file, !inode_locked ? 
IOCB_NOWAIT : 0); +- if (ret) { +- if (!inode_locked) { +- inode_lock(&inode->v); +- inode_locked = true; +- ret = file_remove_privs_flags(file, 0); +- } +- if (ret) +- goto unlock; +- } +- +- ret = file_update_time(file); +- if (ret) +- goto unlock; +- +- pos = iocb->ki_pos; ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; + + bch2_pagecache_add_get(inode); + +- if (!inode_locked && +- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) +- goto get_inode_lock; +- + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +@@ -1064,17 +996,12 @@ again: + } + } + +- if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) +- goto get_inode_lock; +- + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + +- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); +- if (ret == -BCH_ERR_need_inode_lock) +- goto get_inode_lock; ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + +@@ -1095,46 +1022,50 @@ again: + } + pos += ret; + written += ret; +- written2 = max(written, written2); +- +- if (ret != bytes && !inode_locked) +- goto get_inode_lock; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); +- +- if (0) { +-get_inode_lock: +- bch2_pagecache_add_put(inode); +- inode_lock(&inode->v); +- inode_locked = true; +- bch2_pagecache_add_get(inode); +- +- iov_iter_revert(iter, written); +- pos -= written; +- written = 0; +- ret = 0; +- } + } while (iov_iter_count(iter)); +- bch2_pagecache_add_put(inode); +-unlock: +- if (inode_locked) +- inode_unlock(&inode->v); + +- iocb->ki_pos += written; ++ bch2_pagecache_add_put(inode); + +- ret = max(written, written2) ?: ret; +- if (ret > 0) +- ret = generic_write_sync(iocb, ret); +- return ret; ++ return written ? 
written : ret; + } + +-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) + { +- ssize_t ret = iocb->ki_flags & IOCB_DIRECT +- ? bch2_direct_write(iocb, iter) +- : bch2_buffered_write(iocb, iter); ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ ret = bch2_direct_write(iocb, from); ++ goto out; ++ } + ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++out: + return bch2_err_class(ret); + } + diff --git a/queue-6.10/series b/queue-6.10/series index 7ace2e7237e..653e3066781 100644 --- a/queue-6.10/series +++ b/queue-6.10/series @@ -53,3 +53,6 @@ dm-integrity-fix-a-race-condition-when-accessing-recalc_sector.patch clocksource-hyper-v-use-lapic-timer-in-a-tdx-vm-without-paravisor.patch x86-hyperv-fix-kexec-crash-due-to-vp-assist-page-corruption.patch mm-avoid-leaving-partial-pfn-mappings-around-in-error-case.patch +bcachefs-fix-bch2_extents_match-false-positive.patch +bcachefs-revert-lockless-buffered-io-path.patch +bcachefs-don-t-delete-open-files-in-online-fsck.patch