git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.10-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 13 Sep 2024 12:47:46 +0000 (14:47 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 13 Sep 2024 12:47:46 +0000 (14:47 +0200)
added patches:
bcachefs-don-t-delete-open-files-in-online-fsck.patch
bcachefs-fix-bch2_extents_match-false-positive.patch
bcachefs-revert-lockless-buffered-io-path.patch

queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch [new file with mode: 0644]
queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch [new file with mode: 0644]
queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch [new file with mode: 0644]
queue-6.10/series

diff --git a/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch b/queue-6.10/bcachefs-don-t-delete-open-files-in-online-fsck.patch
new file mode 100644 (file)
index 0000000..6e0a0a5
--- /dev/null
@@ -0,0 +1,103 @@
+From ee64e00984ec3ea3fbf9b4331cbd073690ff8765 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sun, 8 Sep 2024 01:06:57 -0400
+Subject: bcachefs: Don't delete open files in online fsck
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit 16005147cca41a0f67b5def2a4656286f8c0db4a ]
+
+If a file is unlinked but still open, we don't want online fsck to
+delete it - or fun inconsistencies will happen.
+
+https://github.com/koverstreet/bcachefs/issues/727
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs.c   |    8 ++++++++
+ fs/bcachefs/fs.h   |    7 +++++++
+ fs/bcachefs/fsck.c |   18 ++++++++++++++++++
+ 3 files changed, 33 insertions(+)
+
+--- a/fs/bcachefs/fs.c
++++ b/fs/bcachefs/fs.c
+@@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_i
+       return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ }
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++      return to_bch_ei(ilookup5_nowait(c->vfs_sb,
++                                       bch2_inode_hash(inum),
++                                       bch2_iget5_test,
++                                       &inum));
++}
++
+ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+ {
+       subvol_inum inum = inode_inum(inode);
+--- a/fs/bcachefs/fs.h
++++ b/fs/bcachefs/fs.h
+@@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(str
+       };
+ }
++struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
++
+ /*
+  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+  * btree inode may be inconsistent:
+@@ -194,6 +196,11 @@ int bch2_vfs_init(void);
+ #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)      ({ do {} while (0); })
++static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
++{
++      return NULL;
++}
++
+ static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+                                              snapshot_id_list *s) {}
+ static inline void bch2_vfs_exit(void) {}
+--- a/fs/bcachefs/fsck.c
++++ b/fs/bcachefs/fsck.c
+@@ -8,6 +8,7 @@
+ #include "darray.h"
+ #include "dirent.h"
+ #include "error.h"
++#include "fs.h"
+ #include "fs-common.h"
+ #include "fsck.h"
+ #include "inode.h"
+@@ -948,6 +949,22 @@ fsck_err:
+       return ret;
+ }
++static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
++{
++      subvol_inum inum = {
++              .subvol = snapshot_t(c, p.snapshot)->subvol,
++              .inum   = p.offset,
++      };
++
++      /* snapshot tree corruption, can't safely delete */
++      if (!inum.subvol) {
++              bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
++              return true;
++      }
++
++      return __bch2_inode_hash_find(c, inum) != NULL;
++}
++
+ static int check_inode(struct btree_trans *trans,
+                      struct btree_iter *iter,
+                      struct bkey_s_c k,
+@@ -1025,6 +1042,7 @@ static int check_inode(struct btree_tran
+       }
+       if (u.bi_flags & BCH_INODE_unlinked &&
++          !bch2_inode_open(c, k.k->p) &&
+           (!c->sb.clean ||
+            fsck_err(c, inode_unlinked_but_clean,
+                     "filesystem marked clean, but inode %llu unlinked",
diff --git a/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch b/queue-6.10/bcachefs-fix-bch2_extents_match-false-positive.patch
new file mode 100644 (file)
index 0000000..cb974cf
--- /dev/null
@@ -0,0 +1,85 @@
+From d02c97848ab5b81489f59bfb64771a30134a7843 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Mon, 26 Aug 2024 19:11:00 -0400
+Subject: bcachefs: Fix bch2_extents_match() false positive
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit d26935690c03fe8159d42358bed1c56252700cd1 ]
+
+This was caught as a very rare nonce inconsistency, on systems with
+encryption and replication (and tiering, or some form of rebalance
+operation running):
+
+[Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path
+[Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] k:   u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd
+[Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0
+[Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only
+
+bch2_extents_match() was reporting true for extents that did not
+actually point to the same data.
+
+bch2_extents_match() iterates over pairs of pointers, looking for
+pointers that point to the same location on disk (with matching
+generation numbers). However one or both extents may have been trimmed
+(or merged) and they might not have the same disk offset: it corrects
+for this by subtracting the key offset and the checksum entry offset.
+
+However, this failed when an extent was immediately partially
+overwritten, and the new overwrite was allocated the next adjacent disk
+space.
+
+Normally, with compression off, this would never cause a bug, since the
+new extent would have to be immediately after the old extent for the
+pointer offsets to match, and the rebalance index update path is not
+looking for an extent outside the range of the extent it moved.
+
+However with compression enabled, extents take up less space on disk
+than they do in the btree index space - and spuriously matching after
+partial overwrite is possible.
+
+To fix this, add a secondary check, that strictly checks that the
+regions pointed to on disk overlap.
+
+https://github.com/koverstreet/bcachefs/issues/717
+
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/extents.c |   23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+--- a/fs/bcachefs/extents.c
++++ b/fs/bcachefs/extents.c
+@@ -932,8 +932,29 @@ bool bch2_extents_match(struct bkey_s_c
+                       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+                               if (p1.ptr.dev          == p2.ptr.dev &&
+                                   p1.ptr.gen          == p2.ptr.gen &&
++
++                                  /*
++                                   * This checks that the two pointers point
++                                   * to the same region on disk - adjusting
++                                   * for the difference in where the extents
++                                   * start, since one may have been trimmed:
++                                   */
+                                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+-                                  (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
++                                  (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
++
++                                  /*
++                                   * This additionally checks that the
++                                   * extents overlap on disk, since the
++                                   * previous check may trigger spuriously
++                                   * when one extent is immediately partially
++                                   * overwritten with another extent (so that
++                                   * on disk they are adjacent) and
++                                   * compression is in use:
++                                   */
++                                  ((p1.ptr.offset >= p2.ptr.offset &&
++                                    p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
++                                   (p2.ptr.offset >= p1.ptr.offset &&
++                                    p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
+                                       return true;
+               return false;
diff --git a/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch b/queue-6.10/bcachefs-revert-lockless-buffered-io-path.patch
new file mode 100644 (file)
index 0000000..21d548c
--- /dev/null
@@ -0,0 +1,258 @@
+From 6700f5b9af27d3feea1162fa43c4861e98f8f021 Mon Sep 17 00:00:00 2001
+From: Kent Overstreet <kent.overstreet@linux.dev>
+Date: Sat, 31 Aug 2024 17:44:51 -0400
+Subject: bcachefs: Revert lockless buffered IO path
+
+From: Kent Overstreet <kent.overstreet@linux.dev>
+
+[ Upstream commit e3e6940940910c2287fe962bdf72015efd4fee81 ]
+
+We had a report of data corruption on nixos when building installer
+images.
+
+https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334
+
+It seems that writes are being dropped, but only when issued by QEMU,
+and possibly only in snapshot mode. It's undetermined whether write
+calls are being dropped or dirty folios.
+
+Further testing, via minimizing the original patch to just the change
+that skips the inode lock on non appends/truncates, reveals that it
+really is just not taking the inode lock that causes the corruption: it
+has nothing to do with the other logic changes for preserving write
+atomicity in corner cases.
+
+It's also kernel config dependent: it doesn't reproduce with the minimal
+kernel config that ktest uses, but it does reproduce with nixos's distro
+config. Bisection the kernel config initially pointer the finger at page
+migration or compaction, but it appears that was erroneous; we haven't
+yet determined what kernel config option actually triggers it.
+
+Sadly it appears this will have to be reverted since we're getting too
+close to release and my plate is full, but we'd _really_ like to fully
+debug it.
+
+My suspicion is that this patch is exposing a preexisting bug - the
+inode lock actually covers very little in IO paths, and we have a
+different lock (the pagecache add lock) that guards against races with
+truncate here.
+
+Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock")
+Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/bcachefs/fs-io-buffered.c |  149 +++++++++++--------------------------------
+ 1 file changed, 40 insertions(+), 109 deletions(-)
+
+--- a/fs/bcachefs/fs-io-buffered.c
++++ b/fs/bcachefs/fs-io-buffered.c
+@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios
+ static int __bch2_buffered_write(struct bch_inode_info *inode,
+                                struct address_space *mapping,
+                                struct iov_iter *iter,
+-                               loff_t pos, unsigned len,
+-                               bool inode_locked)
++                               loff_t pos, unsigned len)
+ {
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation res;
+@@ -828,15 +827,6 @@ static int __bch2_buffered_write(struct
+       BUG_ON(!fs.nr);
+-      /*
+-       * If we're not using the inode lock, we need to lock all the folios for
+-       * atomiticity of writes vs. other writes:
+-       */
+-      if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+-              ret = -BCH_ERR_need_inode_lock;
+-              goto out;
+-      }
+-
+       f = darray_first(fs);
+       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+               ret = bch2_read_single_folio(f, mapping);
+@@ -931,10 +921,8 @@ static int __bch2_buffered_write(struct
+       end = pos + copied;
+       spin_lock(&inode->v.i_lock);
+-      if (end > inode->v.i_size) {
+-              BUG_ON(!inode_locked);
++      if (end > inode->v.i_size)
+               i_size_write(&inode->v, end);
+-      }
+       spin_unlock(&inode->v.i_lock);
+       f_pos = pos;
+@@ -978,68 +966,12 @@ static ssize_t bch2_buffered_write(struc
+       struct file *file = iocb->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+-      loff_t pos;
+-      bool inode_locked = false;
+-      ssize_t written = 0, written2 = 0, ret = 0;
+-
+-      /*
+-       * We don't take the inode lock unless i_size will be changing. Folio
+-       * locks provide exclusion with other writes, and the pagecache add lock
+-       * provides exclusion with truncate and hole punching.
+-       *
+-       * There is one nasty corner case where atomicity would be broken
+-       * without great care: when copying data from userspace to the page
+-       * cache, we do that with faults disable - a page fault would recurse
+-       * back into the filesystem, taking filesystem locks again, and
+-       * deadlock; so it's done with faults disabled, and we fault in the user
+-       * buffer when we aren't holding locks.
+-       *
+-       * If we do part of the write, but we then race and in the userspace
+-       * buffer have been evicted and are no longer resident, then we have to
+-       * drop our folio locks to re-fault them in, breaking write atomicity.
+-       *
+-       * To fix this, we restart the write from the start, if we weren't
+-       * holding the inode lock.
+-       *
+-       * There is another wrinkle after that; if we restart the write from the
+-       * start, and then get an unrecoverable error, we _cannot_ claim to
+-       * userspace that we did not write data we actually did - so we must
+-       * track (written2) the most we ever wrote.
+-       */
+-
+-      if ((iocb->ki_flags & IOCB_APPEND) ||
+-          (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+-              inode_lock(&inode->v);
+-              inode_locked = true;
+-      }
+-
+-      ret = generic_write_checks(iocb, iter);
+-      if (ret <= 0)
+-              goto unlock;
+-
+-      ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+-      if (ret) {
+-              if (!inode_locked) {
+-                      inode_lock(&inode->v);
+-                      inode_locked = true;
+-                      ret = file_remove_privs_flags(file, 0);
+-              }
+-              if (ret)
+-                      goto unlock;
+-      }
+-
+-      ret = file_update_time(file);
+-      if (ret)
+-              goto unlock;
+-
+-      pos = iocb->ki_pos;
++      loff_t pos = iocb->ki_pos;
++      ssize_t written = 0;
++      int ret = 0;
+       bch2_pagecache_add_get(inode);
+-      if (!inode_locked &&
+-          (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+-              goto get_inode_lock;
+-
+       do {
+               unsigned offset = pos & (PAGE_SIZE - 1);
+               unsigned bytes = iov_iter_count(iter);
+@@ -1064,17 +996,12 @@ again:
+                       }
+               }
+-              if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+-                      goto get_inode_lock;
+-
+               if (unlikely(fatal_signal_pending(current))) {
+                       ret = -EINTR;
+                       break;
+               }
+-              ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+-              if (ret == -BCH_ERR_need_inode_lock)
+-                      goto get_inode_lock;
++              ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+               if (unlikely(ret < 0))
+                       break;
+@@ -1095,46 +1022,50 @@ again:
+               }
+               pos += ret;
+               written += ret;
+-              written2 = max(written, written2);
+-
+-              if (ret != bytes && !inode_locked)
+-                      goto get_inode_lock;
+               ret = 0;
+               balance_dirty_pages_ratelimited(mapping);
+-
+-              if (0) {
+-get_inode_lock:
+-                      bch2_pagecache_add_put(inode);
+-                      inode_lock(&inode->v);
+-                      inode_locked = true;
+-                      bch2_pagecache_add_get(inode);
+-
+-                      iov_iter_revert(iter, written);
+-                      pos -= written;
+-                      written = 0;
+-                      ret = 0;
+-              }
+       } while (iov_iter_count(iter));
+-      bch2_pagecache_add_put(inode);
+-unlock:
+-      if (inode_locked)
+-              inode_unlock(&inode->v);
+-      iocb->ki_pos += written;
++      bch2_pagecache_add_put(inode);
+-      ret = max(written, written2) ?: ret;
+-      if (ret > 0)
+-              ret = generic_write_sync(iocb, ret);
+-      return ret;
++      return written ? written : ret;
+ }
+-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+-      ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+-              ? bch2_direct_write(iocb, iter)
+-              : bch2_buffered_write(iocb, iter);
++      struct file *file = iocb->ki_filp;
++      struct bch_inode_info *inode = file_bch_inode(file);
++      ssize_t ret;
++
++      if (iocb->ki_flags & IOCB_DIRECT) {
++              ret = bch2_direct_write(iocb, from);
++              goto out;
++      }
++      inode_lock(&inode->v);
++
++      ret = generic_write_checks(iocb, from);
++      if (ret <= 0)
++              goto unlock;
++
++      ret = file_remove_privs(file);
++      if (ret)
++              goto unlock;
++
++      ret = file_update_time(file);
++      if (ret)
++              goto unlock;
++
++      ret = bch2_buffered_write(iocb, from);
++      if (likely(ret > 0))
++              iocb->ki_pos += ret;
++unlock:
++      inode_unlock(&inode->v);
++
++      if (ret > 0)
++              ret = generic_write_sync(iocb, ret);
++out:
+       return bch2_err_class(ret);
+ }
index 7ace2e7237eeabd860041a34389f4173dc310848..653e3066781c83643f00984d49f3f417e4f98c66 100644 (file)
@@ -53,3 +53,6 @@ dm-integrity-fix-a-race-condition-when-accessing-recalc_sector.patch
 clocksource-hyper-v-use-lapic-timer-in-a-tdx-vm-without-paravisor.patch
 x86-hyperv-fix-kexec-crash-due-to-vp-assist-page-corruption.patch
 mm-avoid-leaving-partial-pfn-mappings-around-in-error-case.patch
+bcachefs-fix-bch2_extents_match-false-positive.patch
+bcachefs-revert-lockless-buffered-io-path.patch
+bcachefs-don-t-delete-open-files-in-online-fsck.patch