struct bpos end_pos;
union {
+ struct {
+ __u32 dev;
+ __u32 data_types;
+ } scrub;
struct {
__u32 dev;
__u32 pad;
__u64 sectors_done;
__u64 sectors_total;
+ __u64 sectors_error_corrected;
+ __u64 sectors_error_uncorrected;
} __packed __aligned(8);
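+/*
+ * Completion status of a data job, reported back to userspace in
+ * struct bch_ioctl_data_event:
+ */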
+enum bch_ioctl_data_event_ret {
+ BCH_IOCTL_DATA_EVENT_RET_done = 1,
+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
+};
+
struct bch_ioctl_data_event {
__u8 type;
- __u8 pad[7];
+ __u8 ret;
+ __u8 pad[6];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
- ctx->stats.done = true;
+ if (ctx->thr.ret == -BCH_ERR_device_offline)
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+ else
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
return 0;
}
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .p.data_type = ctx->stats.done ? U8_MAX : ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_total = bch2_fs_usage_read_short(c).used,
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .ret = ctx->stats.ret,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
};
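+ /*
+ * For scrub, total work is the number of allocated sectors of the
+ * requested data types on the target device; otherwise it's total
+ * filesystem usage:
+ */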
+ if (ctx->arg.op == BCH_DATA_OP_scrub) {
+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+ if (ca) {
+ struct bch_dev_usage u;
+ bch2_dev_usage_read_fast(ca, &u);
+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+ if (ctx->arg.scrub.data_types & BIT(i))
+ e.p.sectors_total += u.d[i].sectors;
+ bch2_dev_put(ca);
+ }
+ } else {
+ e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+ }
+
if (len < sizeof(e))
return -EINVAL;
return nr_replicas >= m->op.nr_replicas;
}
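+/*
+ * Allocate and initialize the read and write bios for a data update,
+ * sized for the largest uncompressed extent we may have to read. Split
+ * out from bch2_data_update_init() so the scrub path can set up its read
+ * bios without constructing a full data update.
+ */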
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ /* write path might have to decompress data: */
+ unsigned buf_bytes = 0;
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+ if (!m->bvecs)
+ return -ENOMEM;
+
+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
+
+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+ kfree(m->bvecs);
+ m->bvecs = NULL;
+ return -ENOMEM;
+ }
+
+ rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+ m->rbio.bio.bi_iter.bi_size = buf_bytes;
+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+ return 0;
+}
+
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
- struct bch_io_opts io_opts,
+ struct bch_io_opts *io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
- bch2_write_op_init(&m->op, c, io_opts);
+ bch2_write_op_init(&m->op, c, *io_opts);
m->op.pos = bkey_start_pos(k.k);
m->op.version = k.k->bversion;
m->op.target = data_opts.target;
BCH_WRITE_data_encoded|
BCH_WRITE_move|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression;
+ m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
unsigned durability_have = 0, durability_removing = 0;
ptr_bit <<= 1;
}
- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
* If current extent durability is less than io_opts.data_replicas,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
if (!ret)
ret = -BCH_ERR_data_update_done_no_writes_needed;
goto out_bkey_buf_exit;
goto out_nocow_unlock;
}
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
- unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
- m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
- if (!m->bvecs)
- goto enomem;
-
- bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
- bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
-
- if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL))
- goto enomem;
-
- rbio_init(&m->rbio.bio, c, io_opts, NULL);
- m->rbio.bio.bi_iter.bi_size = buf_bytes;
- m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
- m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+ ret = bch2_data_update_bios_init(m, c, io_opts);
+ if (ret)
+ goto out_nocow_unlock;
return 0;
-enomem:
- ret = -ENOMEM;
- kfree(m->bvecs);
- m->bvecs = NULL;
out_nocow_unlock:
if (c->opts.nocow_enabled)
bkey_nocow_unlock(c, k);
u8 extra_replicas;
unsigned btree_insert_flags;
unsigned write_flags;
+
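+ /*
+ * scrub: read and verify checksums instead of rewriting;
+ * read_dev is the device to read from
+ */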
+ int read_dev;
+ bool scrub;
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_io_opts *,
struct data_update_opts *);
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+ struct bch_io_opts *);
+
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
- struct bch_io_opts, struct data_update_opts,
+ struct bch_io_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
- orig->opts,
+ &orig->opts,
update_opts,
btree_id, k);
/*
blk_status_t error)
{
rbio->retry = retry;
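+ /* note that this read saw an error, even if a later retry succeeds */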
+ rbio->saw_error = true;
if (rbio->flags & BCH_READ_in_retry)
return;
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
+ BUG();
if (ca)
percpu_ref_put(&ca->io_ref);
goto hole;
have_ioref:1,
narrow_crcs:1,
hole:1,
+ saw_error:1,
retry:2,
context:2;
};
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);
- bch2_data_update_exit(&io->write);
+ if (!io->write.data_opts.scrub) {
+ bch2_data_update_exit(&io->write);
+ } else {
+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+ kfree(io->write.bvecs);
+ }
kfree(io);
}
static void move_write(struct moving_io *io)
{
- if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) {
+ struct moving_context *ctxt = io->write.ctxt;
+
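+ /*
+ * Error accounting: a read that ultimately failed counts as
+ * uncorrected; a read that saw an error but still succeeded counts
+ * as corrected:
+ */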
+ if (ctxt->stats) {
+ if (io->write.rbio.bio.bi_status)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_uncorrected);
+ else if (io->write.rbio.saw_error)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_corrected);
+ }
+
+ if (unlikely(io->write.rbio.bio.bi_status ||
+ io->write.rbio.hole ||
+ io->write.data_opts.scrub)) {
move_free(io);
return;
}
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas) {
+ !data_opts.extra_replicas &&
+ !data_opts.scrub) {
if (data_opts.kill_ptrs)
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free;
+ if (!data_opts.scrub) {
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ &io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free;
+
+ io->write.op.end_io = move_write_done;
+ } else {
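+ /*
+ * Scrub only reads and verifies: we don't set up a full data
+ * update, just the buffers and bios needed for the read:
+ */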
+ bch2_bkey_buf_init(&io->write.k);
+ bch2_bkey_buf_reassemble(&io->write.k, c, k);
+
+ io->write.op.c = c;
+ io->write.data_opts = data_opts;
+
+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+ if (ret)
+ goto err_free;
+ }
io->write.rbio.bio.bi_end_io = move_read_endio;
io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- io->write.op.end_io = move_write_done;
-
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(trans, &io->write.rbio,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- BCH_READ_data_update|
- BCH_READ_last_fragment);
+ __bch2_read_extent(trans, &io->write.rbio,
+ io->write.rbio.bio.bi_iter,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ NULL,
+ BCH_READ_data_update|
+ BCH_READ_last_fragment,
+ data_opts.scrub ? data_opts.read_dev : -1);
return 0;
err_free:
kfree(io);
unsigned dev,
u64 bucket_start,
u64 bucket_end,
+ unsigned data_types,
move_pred_fn pred, void *arg)
{
struct btree_trans *trans = ctxt->trans;
if (ctxt->stats)
ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
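+ /* only process backpointers to the requested data types: */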
+ if (!(data_types & BIT(bp.v->data_type)))
+ goto next;
+
k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto next;
}
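+ /* scrub reads from a specific device; bail out if it has gone offline: */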
+ if (data_opts.scrub &&
+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+ bch2_trans_iter_exit(trans, &iter);
+ ret = -BCH_ERR_device_offline;
+ break;
+ }
+
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
/* move_extent will drop locks */
- unsigned sectors = !bp.v->level
- ? bp.v->bucket_len
- : btree_ptr_sectors_written(k);
+ unsigned sectors = bp.v->bucket_len;
- ret = !bp.v->level
- ? bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts)
- : bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ if (!bp.v->level)
+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
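+/*
+ * Flush the btree write buffer, then walk @dev's backpointers over the
+ * given range, applying @pred to each extent or btree node of the
+ * requested data types:
+ */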
+static int bch2_move_data_phys(struct bch_fs *c,
+ unsigned dev,
+ u64 start,
+ u64 end,
+ unsigned data_types,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+ struct moving_context ctxt;
+
+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ctxt.stats->phys = true;
+
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
struct evacuate_bucket_arg {
struct bpos bucket;
int gen;
bucket.inode,
bucket.offset,
bucket.offset + 1,
+ ~0, /* all data types */
evacuate_bucket_pred, &arg);
}
return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
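+/*
+ * Btree nodes are always scrubbed; data extents with no checksum on the
+ * scrubbed device are skipped, since there is nothing to verify:
+ */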
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_ioctl_data *arg = _arg;
+
+ if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == arg->migrate.dev) {
+ if (!p.crc.csum_type)
+ return false;
+ break;
+ }
+ }
+
+ data_opts->scrub = true;
+ data_opts->read_dev = arg->migrate.dev;
+ return true;
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
switch (op.op) {
+ case BCH_DATA_OP_scrub:
+ /*
+ * Prevent tests from spuriously failing: make sure we see all
+ * btree nodes that need to be repaired.
+ */
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+ op.scrub.data_types,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ false,
+ scrub_pred, &op) ?: ret;
+ break;
+
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
#define _BCACHEFS_MOVE_TYPES_H
#include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
struct bch_move_stats {
char name[32];
bool phys;
- bool done;
+ enum bch_ioctl_data_event_ret ret;
union {
struct {
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
+ atomic64_t sectors_error_corrected;
+ atomic64_t sectors_error_uncorrected;
};
struct move_bucket_key {
return !percpu_ref_is_zero(&ca->io_ref);
}
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
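+/* Look up a device by index and check whether it is currently online: */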
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ bool ret = ca && bch2_dev_is_online(ca);
+ rcu_read_unlock();
+
+ return ret;
+}
+
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&