From: Kent Overstreet
Date: Sun, 29 Dec 2024 00:59:55 +0000 (-0500)
Subject: bcachefs: Scrub
X-Git-Tag: v6.15-rc1~146^2~145
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f269ae55d2de9c6aff5b289cd94c8eaab7b9b2c3;p=thirdparty%2Fkernel%2Flinux.git

bcachefs: Scrub

Add a new data op to walk all data and metadata in a filesystem,
checking if it can be read successfully, and on error repairing from
another copy if possible.

- New helper: bch2_dev_idx_is_online(), so that we can bail out and
  report to userspace when we're unable to scrub because the device is
  offline

- data_update_opts, which controls the data move path, now understands
  scrub: data is only read, not written. The read path is responsible
  for rewriting on read error, as with other reads.

- scrub_pred skips data extents that don't have checksums

- bch_ioctl_data has a new scrub member, with a data_types field
  selecting which data types to check - i.e. all data types, or only
  metadata

- Add new entries to bch_move_stats so that we can report numbers for
  corrected and uncorrected errors

- Add a new enum to bch_ioctl_data_event for explicitly reporting
  completion and return code (e.g. device offline)

Signed-off-by: Kent Overstreet
---
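A minimal userspace sketch of kicking off the new op (illustration
only, not part of the patch): it assumes the existing BCH_IOCTL_DATA
convention, where the ioctl returns a file descriptor that progress
events are then read from; fs_fd (an open fd on the filesystem) and
dev_idx are placeholders:

	/* assumes <sys/ioctl.h> plus the bcachefs_ioctl.h UAPI header */
	struct bch_ioctl_data arg = {
		.op			= BCH_DATA_OP_scrub,
		.scrub.dev		= dev_idx,
		.scrub.data_types	= ~0U,	/* or BIT(BCH_DATA_btree) for metadata only */
	};

	int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);
	if (progress_fd < 0)
		/* scrub couldn't start (bad device index, unsupported op, ...) */
		return -1;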
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index f1b746fac0075..e8a89d375d2f0 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -214,6 +214,10 @@ struct bch_ioctl_data {
 	struct bpos		end_pos;
 
 	union {
+	struct {
+		__u32		dev;
+		__u32		data_types;
+	} scrub;
 	struct {
 		__u32		dev;
 		__u32		pad;
@@ -238,11 +242,19 @@ struct bch_ioctl_data_progress {
 
 	__u64			sectors_done;
 	__u64			sectors_total;
+	__u64			sectors_error_corrected;
+	__u64			sectors_error_uncorrected;
 } __packed __aligned(8);
 
+enum bch_ioctl_data_event_ret {
+	BCH_IOCTL_DATA_EVENT_RET_done			= 1,
+	BCH_IOCTL_DATA_EVENT_RET_device_offline		= 2,
+};
+
 struct bch_ioctl_data_event {
 	__u8			type;
-	__u8			pad[7];
+	__u8			ret;
+	__u8			pad[6];
 	union {
 	struct bch_ioctl_data_progress p;
 	__u64			pad2[15];
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index bc1f91bf3e16e..b38a3c6fe04c9 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -313,7 +313,10 @@ static int bch2_data_thread(void *arg)
 	struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
 
 	ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
-	ctx->stats.done = true;
+	if (ctx->thr.ret == -BCH_ERR_device_offline)
+		ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+	else
+		ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
 	return 0;
 }
 
@@ -332,14 +335,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
 	struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
 	struct bch_fs *c = ctx->c;
 	struct bch_ioctl_data_event e = {
-		.type			= BCH_DATA_EVENT_PROGRESS,
-		.p.data_type		= ctx->stats.done ? U8_MAX : ctx->stats.data_type,
-		.p.btree_id		= ctx->stats.pos.btree,
-		.p.pos			= ctx->stats.pos.pos,
-		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
-		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
+		.type				= BCH_DATA_EVENT_PROGRESS,
+		.ret				= ctx->stats.ret,
+		.p.data_type			= ctx->stats.data_type,
+		.p.btree_id			= ctx->stats.pos.btree,
+		.p.pos				= ctx->stats.pos.pos,
+		.p.sectors_done			= atomic64_read(&ctx->stats.sectors_seen),
+		.p.sectors_error_corrected	= atomic64_read(&ctx->stats.sectors_error_corrected),
+		.p.sectors_error_uncorrected	= atomic64_read(&ctx->stats.sectors_error_uncorrected),
 	};
 
+	if (ctx->arg.op == BCH_DATA_OP_scrub) {
+		struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+		if (ca) {
+			struct bch_dev_usage u;
+			bch2_dev_usage_read_fast(ca, &u);
+			for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+				if (ctx->arg.scrub.data_types & BIT(i))
+					e.p.sectors_total += u.d[i].sectors;
+			bch2_dev_put(ca);
+		}
+	} else {
+		e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+	}
+
 	if (len < sizeof(e))
 		return -EINVAL;
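The consumer side of the event stream the chardev code above produces
might look like the following sketch (progress_fd is from the previous
example; a real tool would sleep or poll() between reads rather than
spinning):

	struct bch_ioctl_data_event e;

	while (read(progress_fd, &e, sizeof(e)) == (ssize_t) sizeof(e)) {
		if (e.type != BCH_DATA_EVENT_PROGRESS)
			continue;

		printf("scrubbed %llu/%llu sectors, corrected %llu, uncorrected %llu\n",
		       (unsigned long long) e.p.sectors_done,
		       (unsigned long long) e.p.sectors_total,
		       (unsigned long long) e.p.sectors_error_corrected,
		       (unsigned long long) e.p.sectors_error_uncorrected);

		if (e.ret == BCH_IOCTL_DATA_EVENT_RET_done ||
		    e.ret == BCH_IOCTL_DATA_EVENT_RET_device_offline)
			break;
	}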
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index ec63dd494c801..9b79cd18d16c8 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -673,12 +673,46 @@ static bool can_allocate_without_blocking(struct bch_fs *c,
 	return nr_replicas >= m->op.nr_replicas;
 }
 
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+			       struct bch_io_opts *io_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	/* write path might have to decompress data: */
+	unsigned buf_bytes = 0;
+	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+	if (!m->bvecs)
+		return -ENOMEM;
+
+	bio_init(&m->rbio.bio,		NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+	bio_init(&m->op.wbio.bio,	NULL, m->bvecs, nr_vecs, 0);
+
+	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+		kfree(m->bvecs);
+		m->bvecs = NULL;
+		return -ENOMEM;
+	}
+
+	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+	m->rbio.bio.bi_iter.bi_size	= buf_bytes;
+	m->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(&m->k.k->k);
+	m->op.wbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+	return 0;
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
 			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
 			  struct data_update *m,
 			  struct write_point_specifier wp,
-			  struct bch_io_opts io_opts,
+			  struct bch_io_opts *io_opts,
 			  struct data_update_opts data_opts,
 			  enum btree_id btree_id,
 			  struct bkey_s_c k)
@@ -705,7 +739,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	m->ctxt		= ctxt;
 	m->stats	= ctxt ? ctxt->stats : NULL;
 
-	bch2_write_op_init(&m->op, c, io_opts);
+	bch2_write_op_init(&m->op, c, *io_opts);
 	m->op.pos	= bkey_start_pos(k.k);
 	m->op.version	= k.k->bversion;
 	m->op.target	= data_opts.target;
@@ -716,7 +750,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		BCH_WRITE_data_encoded|
 		BCH_WRITE_move|
 		m->data_opts.write_flags;
-	m->op.compression_opt	= io_opts.background_compression;
+	m->op.compression_opt	= io_opts->background_compression;
 	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
 
 	unsigned durability_have = 0, durability_removing = 0;
@@ -754,7 +788,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		ptr_bit <<= 1;
 	}
 
-	unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
 	/*
 	 * If current extent durability is less than io_opts.data_replicas,
@@ -787,7 +821,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		m->data_opts.rewrite_ptrs = 0;
 		/* if iter == NULL, it's just a promote */
 		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
+			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
 		if (!ret)
 			ret = -BCH_ERR_data_update_done_no_writes_needed;
 		goto out_bkey_buf_exit;
@@ -825,33 +859,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 		goto out_nocow_unlock;
 	}
 
-	/* write path might have to decompress data: */
-	unsigned buf_bytes = 0;
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
-	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
-	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
-	if (!m->bvecs)
-		goto enomem;
-
-	bio_init(&m->rbio.bio,		NULL, m->bvecs, nr_vecs, REQ_OP_READ);
-	bio_init(&m->op.wbio.bio,	NULL, m->bvecs, nr_vecs, 0);
-
-	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL))
-		goto enomem;
-
-	rbio_init(&m->rbio.bio, c, io_opts, NULL);
-	m->rbio.bio.bi_iter.bi_size	= buf_bytes;
-	m->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
-	m->op.wbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+	ret = bch2_data_update_bios_init(m, c, io_opts);
+	if (ret)
+		goto out_nocow_unlock;
 
 	return 0;
-enomem:
-	ret = -ENOMEM;
-	kfree(m->bvecs);
-	m->bvecs = NULL;
 out_nocow_unlock:
 	if (c->opts.nocow_enabled)
 		bkey_nocow_unlock(c, k);
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index f4cf5d17cc374..c194cbbf5b510 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -16,6 +16,9 @@ struct data_update_opts {
 	u8		extra_replicas;
 	unsigned	btree_insert_flags;
 	unsigned	write_flags;
+
+	int		read_dev;
+	bool		scrub;
 };
 
 void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
@@ -48,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
 			  struct bch_io_opts *,
 			  struct data_update_opts *);
 
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+			       struct bch_io_opts *);
+
 void bch2_data_update_exit(struct data_update *);
 
 int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
 			  struct moving_context *,
 			  struct data_update *,
 			  struct write_point_specifier,
-			  struct bch_io_opts, struct data_update_opts,
+			  struct bch_io_opts *, struct data_update_opts,
 			  enum btree_id, struct bkey_s_c);
 
 void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
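The two new data_update_opts fields are all a move_pred_fn has to set
to turn a data update into a verify-only pass - a sketch with a
hypothetical pred (scrub_pred in move.c below is the real one, which
additionally skips extents without checksums):

	static bool verify_only_pred(struct bch_fs *c, void *arg,
				     struct bkey_s_c k,
				     struct bch_io_opts *io_opts,
				     struct data_update_opts *data_opts)
	{
		data_opts->scrub	= true;			/* read + verify, never write */
		data_opts->read_dev	= *(unsigned *) arg;	/* check this device's copy */
		return true;
	}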
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 6736413314825..cb30bdf52284e 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -243,7 +243,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 
 	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
 			writepoint_hashed((unsigned long) current),
-			orig->opts,
+			&orig->opts,
 			update_opts,
 			btree_id, k);
 	/*
@@ -488,6 +488,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
 			    blk_status_t error)
 {
 	rbio->retry = retry;
+	rbio->saw_error = true;
 
 	if (rbio->flags & BCH_READ_in_retry)
 		return;
@@ -969,6 +970,7 @@ retry_pick:
 		 */
 		struct data_update *u = container_of(orig, struct data_update, rbio);
 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
+			BUG();
 			if (ca)
 				percpu_ref_put(&ca->io_ref);
 			goto hole;
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index 5142f2818b336..73275da5d2c4b 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -41,6 +41,7 @@ struct bch_read_bio {
 			have_ioref:1,
 			narrow_crcs:1,
 			hole:1,
+			saw_error:1,
 			retry:2,
 			context:2;
 	};
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index ff396b33ef242..7614370f45909 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -89,7 +89,12 @@ static void move_free(struct moving_io *io)
 		wake_up(&ctxt->wait);
 	mutex_unlock(&ctxt->lock);
 
-	bch2_data_update_exit(&io->write);
+	if (!io->write.data_opts.scrub) {
+		bch2_data_update_exit(&io->write);
+	} else {
+		bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+		kfree(io->write.bvecs);
+	}
 	kfree(io);
 }
 
@@ -109,7 +114,20 @@ static void move_write_done(struct bch_write_op *op)
 
 static void move_write(struct moving_io *io)
 {
-	if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) {
+	struct moving_context *ctxt = io->write.ctxt;
+
+	if (ctxt->stats) {
+		if (io->write.rbio.bio.bi_status)
+			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+				     &ctxt->stats->sectors_error_uncorrected);
+		else if (io->write.rbio.saw_error)
+			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+				     &ctxt->stats->sectors_error_corrected);
+	}
+
+	if (unlikely(io->write.rbio.bio.bi_status ||
+		     io->write.rbio.hole ||
+		     io->write.data_opts.scrub)) {
 		move_free(io);
 		return;
 	}
@@ -263,7 +281,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 	bch2_data_update_opts_normalize(k, &data_opts);
 
 	if (!data_opts.rewrite_ptrs &&
-	    !data_opts.extra_replicas) {
+	    !data_opts.extra_replicas &&
+	    !data_opts.scrub) {
 		if (data_opts.kill_ptrs)
 			return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
 		return 0;
@@ -284,16 +303,28 @@ int bch2_move_extent(struct moving_context *ctxt,
 	io->read_sectors	= k.k->size;
 	io->write_sectors	= k.k->size;
 
-	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
-				    io_opts, data_opts, iter->btree_id, k);
-	if (ret)
-		goto err_free;
+	if (!data_opts.scrub) {
+		ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+					    &io_opts, data_opts, iter->btree_id, k);
+		if (ret)
+			goto err_free;
+
+		io->write.op.end_io = move_write_done;
+	} else {
+		bch2_bkey_buf_init(&io->write.k);
+		bch2_bkey_buf_reassemble(&io->write.k, c, k);
+
+		io->write.op.c = c;
+		io->write.data_opts = data_opts;
+
+		ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+		if (ret)
+			goto err_free;
+	}
 
 	io->write.rbio.bio.bi_end_io = move_read_endio;
 	io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 
-	io->write.op.end_io = move_write_done;
-
 	if (ctxt->rate)
 		bch2_ratelimit_increment(ctxt->rate, k.k->size);
@@ -324,11 +355,14 @@ int bch2_move_extent(struct moving_context *ctxt,
 	 * ctxt when doing wakeup
 	 */
 	closure_get(&ctxt->cl);
-	bch2_read_extent(trans, &io->write.rbio,
-			 bkey_start_pos(k.k),
-			 iter->btree_id, k, 0,
-			 BCH_READ_data_update|
-			 BCH_READ_last_fragment);
+	__bch2_read_extent(trans, &io->write.rbio,
+			   io->write.rbio.bio.bi_iter,
+			   bkey_start_pos(k.k),
+			   iter->btree_id, k, 0,
+			   NULL,
+			   BCH_READ_data_update|
+			   BCH_READ_last_fragment,
+			   data_opts.scrub ? data_opts.read_dev : -1);
 	return 0;
 err_free:
 	kfree(io);
@@ -669,6 +703,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 			  unsigned dev,
 			  u64 bucket_start,
 			  u64 bucket_end,
+			  unsigned data_types,
 			  move_pred_fn pred, void *arg)
 {
 	struct btree_trans *trans = ctxt->trans;
@@ -737,6 +772,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 		if (ctxt->stats)
 			ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
 
+		if (!(data_types & BIT(bp.v->data_type)))
+			goto next;
+
 		k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
 		ret = bkey_err(k);
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -760,17 +798,25 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 			goto next;
 		}
 
+		if (data_opts.scrub &&
+		    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+			bch2_trans_iter_exit(trans, &iter);
+			ret = -BCH_ERR_device_offline;
+			break;
+		}
+
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		k = bkey_i_to_s_c(sk.k);
 
 		/* move_extent will drop locks */
-		unsigned sectors = !bp.v->level
-			? bp.v->bucket_len
-			: btree_ptr_sectors_written(k);
+		unsigned sectors = bp.v->bucket_len;
 
-		ret = !bp.v->level
-			? bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts)
-			: bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+		if (!bp.v->level)
+			ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+		else if (!data_opts.scrub)
+			ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+		else
+			ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
 
 		bch2_trans_iter_exit(trans, &iter);
@@ -797,6 +843,30 @@ err:
 	return ret;
 }
 
+static int bch2_move_data_phys(struct bch_fs *c,
+			       unsigned dev,
+			       u64 start,
+			       u64 end,
+			       unsigned data_types,
+			       struct bch_ratelimit *rate,
+			       struct bch_move_stats *stats,
+			       struct write_point_specifier wp,
+			       bool wait_on_copygc,
+			       move_pred_fn pred, void *arg)
+{
+	struct moving_context ctxt;
+
+	bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+	ctxt.stats->phys = true;
+
+	int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+	bch2_moving_ctxt_exit(&ctxt);
+
+	return ret;
+}
+
 struct evacuate_bucket_arg {
 	struct bpos	bucket;
 	int		gen;
@@ -834,6 +904,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 				   bucket.inode,
 				   bucket.offset,
 				   bucket.offset + 1,
+				   ~0,
 				   evacuate_bucket_pred, &arg);
 }
 
@@ -1075,6 +1146,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
 	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
 
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+		       struct bkey_s_c k,
+		       struct bch_io_opts *io_opts,
+		       struct data_update_opts *data_opts)
+{
+	struct bch_ioctl_data *arg = _arg;
+
+	if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+			if (p.ptr.dev == arg->migrate.dev) {
+				if (!p.crc.csum_type)
+					return false;
+				break;
+			}
+	}
+
+	data_opts->scrub	= true;
+	data_opts->read_dev	= arg->migrate.dev;
+	return true;
+}
+
 int bch2_data_job(struct bch_fs *c,
 		  struct bch_move_stats *stats,
 		  struct bch_ioctl_data op)
@@ -1089,6 +1184,22 @@ int bch2_data_job(struct bch_fs *c,
 	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
 
 	switch (op.op) {
+	case BCH_DATA_OP_scrub:
+		/*
+		 * prevent tests from spuriously failing, make sure we see all
+		 * btree nodes that need to be repaired
+		 */
+		bch2_btree_interior_updates_flush(c);
+
+		ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+					  op.scrub.data_types,
+					  NULL,
+					  stats,
+					  writepoint_hashed((unsigned long) current),
+					  false,
+					  scrub_pred, &op) ?: ret;
+		break;
+
 	case BCH_DATA_OP_rereplicate:
 		stats->data_type = BCH_DATA_journal;
 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
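Note that scrub.data_types is a bitmask over enum bch_data_type,
matched against each backpointer's data_type via BIT() in
__bch2_move_data_phys() above; for example (BCH_DATA_user assumed from
the standard enum):

	unsigned data_types;

	data_types = BIT(BCH_DATA_btree);			/* btree metadata only */
	data_types = BIT(BCH_DATA_btree)|BIT(BCH_DATA_user);	/* metadata + user extents */
	data_types = ~0;					/* everything, as bch2_evacuate_bucket() passes */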
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index 15d1f7f3d1dc6..82e473ed48d2d 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -3,11 +3,12 @@
 #define _BCACHEFS_MOVE_TYPES_H
 
 #include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
 
 struct bch_move_stats {
 	char			name[32];
 	bool			phys;
-	bool			done;
+	enum bch_ioctl_data_event_ret	ret;
 
 	union {
 	struct {
@@ -25,6 +26,8 @@ struct bch_move_stats {
 	atomic64_t		sectors_seen;
 	atomic64_t		sectors_moved;
 	atomic64_t		sectors_raced;
+	atomic64_t		sectors_error_corrected;
+	atomic64_t		sectors_error_uncorrected;
 };
 
 struct move_bucket_key {
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 762083b564ee5..b29b6c6c21dd2 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
 	return !percpu_ref_is_zero(&ca->io_ref);
 }
 
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+	rcu_read_lock();
+	struct bch_dev *ca = bch2_dev_rcu(c, dev);
+	bool ret = ca && bch2_dev_is_online(ca);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static inline bool bch2_dev_is_readable(struct bch_dev *ca)
 {
 	return bch2_dev_is_online(ca) &&