]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
bcachefs: Don't delete reflink pointers to missing indirect extents
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 21 Oct 2024 00:27:44 +0000 (20:27 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sat, 21 Dec 2024 06:36:18 +0000 (01:36 -0500)
To avoid tragic data loss in the event of transient errors (e.g., a btree
node topology error that is later corrected by btree node scan), we
can't delete reflink pointers in order to correct errors.

This adds a new error bit to bch_reflink_p, indicating that it is known
to point to a missing indirect extent, and the error has already been
reported.

Indirect extent lookups now use bch2_lookup_indirect_extent(), which on
error reports it via fsck_err() and sets the error bit, and on a
successful lookup clears the error bit if it was set.

This also gets rid of the bch2_inconsistent_error() call in
__bch2_read_indirect_extent, and in the reflink_p trigger: part of the
online self healing project.

An on disk format change isn't necessary here: setting the error bit
will be interpreted by older versions as pointing to a different index,
which will also be missing - which is fine.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/fs-io-buffered.c
fs/bcachefs/fs.c
fs/bcachefs/io_read.c
fs/bcachefs/io_read.h
fs/bcachefs/reflink.c
fs/bcachefs/reflink.h
fs/bcachefs/reflink_format.h

index b853cecd3c1b965f53537bf6ff5b31be72b3cb3d..d55e215e8aa62a8b3796799a691562fd62107178 100644 (file)
@@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans,
                             BTREE_ITER_slots);
        while (1) {
                struct bkey_s_c k;
-               unsigned bytes, sectors, offset_into_extent;
+               unsigned bytes, sectors;
+               s64 offset_into_extent;
                enum btree_id data_btree = BTREE_ID_extents;
 
                bch2_trans_begin(trans);
@@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans,
 
                k = bkey_i_to_s_c(sk.k);
 
-               sectors = min(sectors, k.k->size - offset_into_extent);
+               sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
 
                if (readpages_iter) {
                        ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
index f852dbf30aa2fee795bb62dcf97a7bdbf38db1d7..50d323fca001f615da7693dae39d4a79cc5bd048 100644 (file)
@@ -1261,7 +1261,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
-       unsigned offset_into_extent, sectors;
        bool have_extent = false;
        int ret = 0;
 
@@ -1308,9 +1307,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                        continue;
                }
 
-               offset_into_extent      = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors                 = k.k->size - offset_into_extent;
+               s64 offset_into_extent  = iter.pos.offset - bkey_start_offset(k.k);
+               unsigned sectors        = k.k->size - offset_into_extent;
 
                bch2_bkey_buf_reassemble(&cur, c, k);
 
@@ -1322,7 +1320,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                k = bkey_i_to_s_c(cur.k);
                bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
 
-               sectors = min(sectors, k.k->size - offset_into_extent);
+               sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
 
                bch2_cut_front(POS(k.k->p.inode,
                                   bkey_start_offset(k.k) +
index c700a95df89e3c8263ab689093358c5cd9f9d29e..eb8d12fd6398482714dacf692e390d25a6f7be76 100644 (file)
@@ -21,6 +21,7 @@
 #include "io_read.h"
 #include "io_misc.h"
 #include "io_write.h"
+#include "reflink.h"
 #include "subvolume.h"
 #include "trace.h"
 
@@ -750,41 +751,6 @@ static void bch2_read_endio(struct bio *bio)
        bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
 }
 
-int __bch2_read_indirect_extent(struct btree_trans *trans,
-                               unsigned *offset_into_extent,
-                               struct bkey_buf *orig_k)
-{
-       struct bkey_i_reflink_p *p = bkey_i_to_reflink_p(orig_k->k);
-       u64 reflink_offset = REFLINK_P_IDX(&p->v) + *offset_into_extent;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
-                              POS(0, reflink_offset), 0);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (k.k->type != KEY_TYPE_reflink_v &&
-           k.k->type != KEY_TYPE_indirect_inline_data) {
-               bch_err_inum_offset_ratelimited(trans->c,
-                       orig_k->k->k.p.inode,
-                       orig_k->k->k.p.offset << 9,
-                       "%llu len %u points to nonexistent indirect extent %llu",
-                       orig_k->k->k.p.offset,
-                       orig_k->k->k.size,
-                       reflink_offset);
-               bch2_inconsistent_error(trans->c);
-               ret = -BCH_ERR_missing_indirect_extent;
-               goto err;
-       }
-
-       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
-       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
                                                   struct bch_dev *ca,
                                                   struct bkey_s_c k,
@@ -1160,7 +1126,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
                             BTREE_ITER_slots);
 
        while (1) {
-               unsigned bytes, sectors, offset_into_extent;
                enum btree_id data_btree = BTREE_ID_extents;
 
                bch2_trans_begin(trans);
@@ -1180,9 +1145,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
                if (ret)
                        goto err;
 
-               offset_into_extent = iter.pos.offset -
+               s64 offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
+               unsigned sectors = k.k->size - offset_into_extent;
 
                bch2_bkey_buf_reassemble(&sk, c, k);
 
@@ -1197,9 +1162,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
                 * With indirect extents, the amount of data to read is the min
                 * of the original extent and the indirect extent:
                 */
-               sectors = min(sectors, k.k->size - offset_into_extent);
+               sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
 
-               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+               unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
                swap(bvec_iter.bi_size, bytes);
 
                if (bvec_iter.bi_size == bytes)
index d9c18bb7d4035aee9884de2dc2019e4a97f7249d..a82e8a94ccb612e1e638f093350b6ee62c6501a5 100644 (file)
@@ -3,6 +3,7 @@
 #define _BCACHEFS_IO_READ_H
 
 #include "bkey_buf.h"
+#include "reflink.h"
 
 struct bch_read_bio {
        struct bch_fs           *c;
@@ -79,19 +80,32 @@ struct bch_devs_mask;
 struct cache_promote_op;
 struct extent_ptr_decoded;
 
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-                               struct bkey_buf *);
-
 static inline int bch2_read_indirect_extent(struct btree_trans *trans,
                                            enum btree_id *data_btree,
-                                           unsigned *offset_into_extent,
-                                           struct bkey_buf *k)
+                                           s64 *offset_into_extent,
+                                           struct bkey_buf *extent)
 {
-       if (k->k->k.type != KEY_TYPE_reflink_p)
+       if (extent->k->k.type != KEY_TYPE_reflink_p)
                return 0;
 
        *data_btree = BTREE_ID_reflink;
-       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+       struct btree_iter iter;
+       struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
+                                               offset_into_extent,
+                                               bkey_i_to_s_c_reflink_p(extent->k),
+                                               true, 0);
+       int ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       if (bkey_deleted(k.k)) {
+               bch2_trans_iter_exit(trans, &iter);
+               return -BCH_ERR_missing_indirect_extent;
+       }
+
+       bch2_bkey_buf_reassemble(extent, trans->c, k);
+       bch2_trans_iter_exit(trans, &iter);
+       return 0;
 }
 
 enum bch_read_flags {
index 36fb1e9473ff6903c32d63bc754032b62cf23eaf..38db5a0117027122831b73676479cb933acfdaac 100644 (file)
 
 #include <linux/sched/signal.h>
 
+static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
+{
+       switch (k->type) {      /* classify purely on the key's type field */
+       case KEY_TYPE_reflink_v:
+       case KEY_TYPE_indirect_inline_data:
+               return true;    /* the two key types accepted as indirect-extent data */
+       default:
+               return false;   /* every other key type */
+       }
+}
+
 static inline unsigned bkey_type_to_indirect(const struct bkey *k)
 {
        switch (k->type) {
@@ -68,6 +79,9 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
        if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
                return false;
 
+       if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
+               return false;
+
        bch2_key_resize(l.k, l.k->size + r.k->size);
        return true;
 }
@@ -130,6 +144,144 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
               min(datalen, 32U), d.v->data);
 }
 
+/*
+ * lookup
+ *
+ * bch2_indirect_extent_not_missing() is called from
+ * bch2_lookup_indirect_extent() when the reflink pointer @p has
+ * REFLINK_P_ERROR set but the lookup found indirect-extent data after all.
+ *
+ * It makes a mutable copy of @p, clears REFLINK_P_ERROR on the copy and
+ * inserts the copy into BTREE_ID_extents with BTREE_TRIGGER_norun.
+ *
+ * Return value:
+ *  - a non-zero error if making the copy or the insert fails;
+ *  - 0 if @should_commit is false (the update is left in @trans for the
+ *    caller's commit);
+ *  - if @should_commit is true, the error from bch2_trans_commit(), or
+ *    -BCH_ERR_transaction_restart_nested when the commit succeeded; 0 is
+ *    never returned in this mode.
+ *
+ * NOTE(review): returning a restart error after a successful commit
+ * presumably makes the caller redo its lookup against the repaired key -
+ * confirm against the callers' restart handling.
+ */
+
+static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
+                                           bool should_commit)
+{
+       struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+       int ret = PTR_ERR_OR_ZERO(new);
+       if (ret)
+               return ret;
+
+       SET_REFLINK_P_ERROR(&new->v, false);
+       ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+       if (ret)
+               return ret;
+
+       if (!should_commit)
+               return 0;
+
+       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+               -BCH_ERR_transaction_restart_nested;
+}
+
+static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
+                                             struct bkey_s_c_reflink_p p,
+                                             u64 missing_start, u64 missing_end,
+                                             bool should_commit)
+{
+       if (REFLINK_P_ERROR(p.v))       /* error bit already set: don't report or rewrite again */
+               return -BCH_ERR_missing_indirect_extent;
+
+       struct bch_fs *c = trans->c;
+       u64 live_start  = REFLINK_P_IDX(p.v);
+       u64 live_end    = REFLINK_P_IDX(p.v) + p.k->size;
+       u64 refd_start  = live_start    - le32_to_cpu(p.v->front_pad);
+       u64 refd_end    = live_end      + le32_to_cpu(p.v->back_pad);
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       BUG_ON(missing_start    < refd_start);  /* missing range must lie within [refd_start, refd_end] */
+       BUG_ON(missing_end      > refd_end);
+
+       if (fsck_err(trans, reflink_p_to_missing_reflink_v,
+                    "pointer to missing indirect extent\n"
+                    "  %s\n"
+                    "  missing range %llu-%llu",
+                    (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+                    missing_start, missing_end)) {
+               struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+               ret = PTR_ERR_OR_ZERO(new);
+               if (ret)
+                       goto err;
+
+               /*
+                * Is the missing range not actually needed?
+                *
+                * p.v->idx refers to the data that we actually want, but if the
+                * indirect extent we point to was bigger, front_pad and back_pad
+                * indicate the range we took a reference on.
+                *
+                * live_start/live_end above bound the data we actually want;
+                * refd_start/refd_end extend that by front_pad/back_pad.
+                *
+                * Three cases follow:
+                *  - the missing range ends at or before live_start: it lies
+                *    entirely in the front padding, so front_pad is reduced
+                *    to (live_start - missing_end);
+                *  - the missing range starts at or after live_end: it lies
+                *    entirely in the back padding, so back_pad is reduced to
+                *    (missing_start - live_end);
+                *  - otherwise the missing range overlaps the live range: the
+                *    copy is cut down to that overlap and flagged with
+                *    REFLINK_P_ERROR instead of being deleted.
+                */
+
+               if (missing_end <= live_start) {
+                       new->v.front_pad = cpu_to_le32(live_start - missing_end);
+               } else if (missing_start >= live_end) {
+                       new->v.back_pad = cpu_to_le32(missing_start - live_end);
+               } else {
+                       struct bpos new_start   = bkey_start_pos(&new->k);
+                       struct bpos new_end     = new->k.p;
+
+                       if (missing_start > live_start)
+                               new_start.offset += missing_start - live_start;
+                       if (missing_end < live_end)
+                               new_end.offset -= live_end - missing_end;
+
+                       bch2_cut_front(new_start, &new->k_i);
+                       bch2_cut_back(new_end, &new->k_i);
+
+                       SET_REFLINK_P_ERROR(&new->v, true);
+               }
+
+               ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+               if (ret)
+                       goto err;
+
+               if (should_commit)
+                       ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+                               -BCH_ERR_transaction_restart_nested;    /* a successful commit still yields a restart error */
+       }
+err:
+fsck_err:      /* NOTE(review): no goto to this label is visible here; presumably used inside the fsck_err() macro - confirm */
+       printbuf_exit(&buf);
+       return ret;
+}
+
+/*
+ * Look up the indirect extent that reflink pointer @p refers to.
+ *
+ * This is used from the read path, which doesn't expect to have to do a
+ * transaction commit, and from triggers, which should not be doing a commit:
+ * @should_commit is passed through to the two repair helpers and decides
+ * whether they commit the repaired pointer themselves.
+ *
+ * @offset_into_extent is in/out. On entry it is an offset relative to
+ * REFLINK_P_IDX(p.v); the BUG_ON()s require it to be no lower than -front_pad
+ * and below p.k->size + back_pad. When a key is returned it has been rewritten
+ * to the offset of the lookup position from the start of that key.
+ *
+ * The key is fetched from BTREE_ID_reflink at
+ * POS(0, REFLINK_P_IDX(p.v) + *offset_into_extent) using @iter and
+ * @iter_flags; a lookup error is returned as-is.
+ *
+ *  - If the key found is not indirect-extent data, @iter is exited, iter->k
+ *    is resized to min(k.k->size, end of the referenced range -
+ *    reflink_offset), and bch2_indirect_extent_missing_error() is called for
+ *    the range reflink_offset..k.k->p.offset. A non-zero result is returned
+ *    as an error key; on 0 the non-data key itself is returned, so callers
+ *    must still check what they got back.
+ *  - Otherwise, if @p has REFLINK_P_ERROR set, @iter is exited and
+ *    bch2_indirect_extent_not_missing() clears the stale error bit; a
+ *    non-zero result is returned as an error key.
+ *
+ * NOTE(review): in both repair branches k is still read (k.k->size,
+ * k.k->p.offset, bkey_start_offset(k.k)) after bch2_trans_iter_exit(), and
+ * iter->k is modified after the exit - confirm k stays valid at that point.
+ */
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
+                                           struct btree_iter *iter,
+                                           s64 *offset_into_extent,
+                                           struct bkey_s_c_reflink_p p,
+                                           bool should_commit,
+                                           unsigned iter_flags)
+{
+       BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
+       BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
+
+       u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
+
+       struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
+                                      POS(0, reflink_offset), iter_flags);
+       if (bkey_err(k))
+               return k;
+
+       if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
+               bch2_trans_iter_exit(trans, iter);
+
+               unsigned size = min((u64) k.k->size,
+                                   REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
+                                   reflink_offset);
+               bch2_key_resize(&iter->k, size);
+
+               int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
+                                                            k.k->p.offset, should_commit);
+               if (ret)
+                       return bkey_s_c_err(ret);
+       } else if (unlikely(REFLINK_P_ERROR(p.v))) {
+               bch2_trans_iter_exit(trans, iter);
+
+               int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
+               if (ret)
+                       return bkey_s_c_err(ret);
+       }
+
+       *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
+       return k;
+}
+
 /* reflink pointer trigger */
 
 static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
@@ -137,37 +289,37 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
                        enum btree_iter_update_trigger_flags flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_i *k;
-       __le64 *refcount;
-       int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
        struct printbuf buf = PRINTBUF;
-       int ret;
 
-       k = bch2_bkey_get_mut_noupdate(trans, &iter,
-                       BTREE_ID_reflink, POS(0, *idx),
-                       BTREE_ITER_with_updates);
-       ret = PTR_ERR_OR_ZERO(k);
+       s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
+       struct btree_iter iter;
+       struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
+                                                       BTREE_ITER_intent|
+                                                       BTREE_ITER_with_updates);
+       int ret = bkey_err(k);
        if (ret)
-               goto err;
+               return ret;
 
-       refcount = bkey_refcount(bkey_i_to_s(k));
-       if (!refcount) {
-               bch2_bkey_val_to_text(&buf, c, p.s_c);
-               bch2_trans_inconsistent(trans,
-                       "nonexistent indirect extent at %llu while marking\n  %s",
-                       *idx, buf.buf);
-               ret = -EIO;
-               goto err;
+       if (bkey_deleted(k.k)) {
+               if (!(flags & BTREE_TRIGGER_overwrite))
+                       ret = -BCH_ERR_missing_indirect_extent;
+               goto next;
        }
 
+       struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+       ret = PTR_ERR_OR_ZERO(new);
+       if (ret)
+               goto err;
+
+       __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
        if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
                bch2_bkey_val_to_text(&buf, c, p.s_c);
-               bch2_trans_inconsistent(trans,
-                       "indirect extent refcount underflow at %llu while marking\n  %s",
-                       *idx, buf.buf);
-               ret = -EIO;
-               goto err;
+               prt_printf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               log_fsck_err(trans, reflink_refcount_underflow,
+                            "indirect extent refcount underflow while marking\n  %s",
+                          buf.buf);
+               goto next;
        }
 
        if (flags & BTREE_TRIGGER_insert) {
@@ -175,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
                u64 pad;
 
                pad = max_t(s64, le32_to_cpu(v->front_pad),
-                           REFLINK_P_IDX(v) - bkey_start_offset(&k->k));
+                           REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
                BUG_ON(pad > U32_MAX);
                v->front_pad = cpu_to_le32(pad);
 
                pad = max_t(s64, le32_to_cpu(v->back_pad),
-                           k->k.p.offset - p.k->size - REFLINK_P_IDX(v));
+                           new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
                BUG_ON(pad > U32_MAX);
                v->back_pad = cpu_to_le32(pad);
        }
 
-       le64_add_cpu(refcount, add);
+       le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
 
        bch2_btree_iter_set_pos_to_extent_start(&iter);
-       ret = bch2_trans_update(trans, &iter, k, 0);
+       ret = bch2_trans_update(trans, &iter, new, 0);
        if (ret)
                goto err;
-
-       *idx = k->k.p.offset;
+next:
+       *idx = k.k->p.offset;
 err:
+fsck_err:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
@@ -207,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct reflink_gc *r;
        int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
-       u64 start = REFLINK_P_IDX(p.v);
-       u64 end = start + p.k->size;
-       u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+       u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
        s64 ret = 0;
        struct printbuf buf = PRINTBUF;
 
@@ -228,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
        *idx = r->offset;
        return 0;
 not_found:
-       BUG_ON(!(flags & BTREE_TRIGGER_check_repair));
-
-       if (fsck_err(trans, reflink_p_to_missing_reflink_v,
-                    "pointer to missing indirect extent\n"
-                    "  %s\n"
-                    "  missing range %llu-%llu",
-                    (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
-                    *idx, next_idx)) {
-               struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
-               ret = PTR_ERR_OR_ZERO(update);
+       if (flags & BTREE_TRIGGER_check_repair) {
+               ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
                if (ret)
                        goto err;
-
-               if (next_idx <= start) {
-                       bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
-               } else if (*idx >= end) {
-                       bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
-               } else {
-                       bkey_error_init(update);
-                       update->k.p             = p.k->p;
-                       update->k.size          = p.k->size;
-                       set_bkey_val_u64s(&update->k, 0);
-               }
-
-               ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun);
        }
 
        *idx = next_idx;
 err:
-fsck_err:
        printbuf_exit(&buf);
        return ret;
 }
index 6ec3a9ea6bb479e15cd72104c7650962d61ecef7..b61a4bdd8e829219c53b58ebeef40d2ff22cc99f 100644 (file)
@@ -73,6 +73,10 @@ static inline __le64 *bkey_refcount(struct bkey_s k)
        }
 }
 
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
+                                           s64 *, struct bkey_s_c_reflink_p,
+                                           bool, unsigned);
+
 s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
                     subvol_inum, u64, u64, u64, s64 *);
 
index 0d8de13b9ddf7335c0d4b0bcb98d438e59c1536a..53502627b2c5b037e4ebeffcc3b69f8f41864cdb 100644 (file)
@@ -18,6 +18,7 @@ struct bch_reflink_p {
 } __packed __aligned(8);
 
 LE64_BITMASK(REFLINK_P_IDX,    struct bch_reflink_p, idx_flags,  0, 56);
+LE64_BITMASK(REFLINK_P_ERROR,  struct bch_reflink_p, idx_flags, 56, 57);
 
 struct bch_reflink_v {
        struct bch_val          v;