1 // SPDX-License-Identifier: GPL-2.0
4 #include "alloc_background.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
15 #include "fs-common.h"
16 #include "journal_io.h"
17 #include "journal_reclaim.h"
18 #include "journal_seq_blacklist.h"
19 #include "logged_ops.h"
22 #include "rebalance.h"
24 #include "recovery_passes.h"
27 #include "sb-downgrade.h"
31 #include <linux/sort.h>
32 #include <linux/stat.h>
34 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
36 static bool btree_id_is_alloc(enum btree_id id
)
40 case BTREE_ID_backpointers
:
41 case BTREE_ID_need_discard
:
42 case BTREE_ID_freespace
:
43 case BTREE_ID_bucket_gens
:
50 /* for -o reconstruct_alloc: */
51 static void bch2_reconstruct_alloc(struct bch_fs
*c
)
53 bch2_journal_log_msg(c
, "dropping alloc info");
54 bch_info(c
, "dropping and reconstructing all alloc info");
56 mutex_lock(&c
->sb_lock
);
57 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(c
->disk_sb
.sb
, ext
);
59 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations
, ext
->recovery_passes_required
);
60 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info
, ext
->recovery_passes_required
);
61 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus
, ext
->recovery_passes_required
);
62 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers
, ext
->recovery_passes_required
);
63 __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs
, ext
->recovery_passes_required
);
65 __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key
, ext
->errors_silent
);
66 __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen
, ext
->errors_silent
);
67 __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr
, ext
->errors_silent
);
68 __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong
, ext
->errors_silent
);
69 __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong
, ext
->errors_silent
);
70 __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong
, ext
->errors_silent
);
71 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong
, ext
->errors_silent
);
72 __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong
, ext
->errors_silent
);
73 __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong
, ext
->errors_silent
);
74 __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong
, ext
->errors_silent
);
75 __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong
, ext
->errors_silent
);
76 __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing
, ext
->errors_silent
);
77 __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer
, ext
->errors_silent
);
78 __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad
, ext
->errors_silent
);
79 c
->sb
.compat
&= ~(1ULL << BCH_COMPAT_alloc_info
);
82 mutex_unlock(&c
->sb_lock
);
84 c
->recovery_passes_explicit
|= bch2_recovery_passes_from_stable(le64_to_cpu(ext
->recovery_passes_required
[0]));
87 bch2_shoot_down_journal_keys(c
, BTREE_ID_alloc
,
88 0, BTREE_MAX_DEPTH
, POS_MIN
, SPOS_MAX
);
89 bch2_shoot_down_journal_keys(c
, BTREE_ID_backpointers
,
90 0, BTREE_MAX_DEPTH
, POS_MIN
, SPOS_MAX
);
91 bch2_shoot_down_journal_keys(c
, BTREE_ID_need_discard
,
92 0, BTREE_MAX_DEPTH
, POS_MIN
, SPOS_MAX
);
93 bch2_shoot_down_journal_keys(c
, BTREE_ID_freespace
,
94 0, BTREE_MAX_DEPTH
, POS_MIN
, SPOS_MAX
);
95 bch2_shoot_down_journal_keys(c
, BTREE_ID_bucket_gens
,
96 0, BTREE_MAX_DEPTH
, POS_MIN
, SPOS_MAX
);
100 * Btree node pointers have a field to stack a pointer to the in memory btree
101 * node; we need to zero out this field when reading in btree nodes, or when
102 * reading in keys from the journal:
104 static void zero_out_btree_mem_ptr(struct journal_keys
*keys
)
106 darray_for_each(*keys
, i
)
107 if (i
->k
->k
.type
== KEY_TYPE_btree_ptr_v2
)
108 bkey_i_to_btree_ptr_v2(i
->k
)->v
.mem_ptr
= 0;
111 /* journal replay: */
113 static void replay_now_at(struct journal
*j
, u64 seq
)
115 BUG_ON(seq
< j
->replay_journal_seq
);
117 seq
= min(seq
, j
->replay_journal_seq_end
);
119 while (j
->replay_journal_seq
< seq
)
120 bch2_journal_pin_put(j
, j
->replay_journal_seq
++);
123 static int bch2_journal_replay_key(struct btree_trans
*trans
,
124 struct journal_key
*k
)
126 struct btree_iter iter
;
127 unsigned iter_flags
=
129 BTREE_ITER_NOT_EXTENTS
;
130 unsigned update_flags
= BTREE_TRIGGER_NORUN
;
136 trans
->journal_res
.seq
= k
->journal_seq
;
139 * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
140 * keep the key cache coherent with the underlying btree. Nothing
141 * besides the allocator is doing updates yet so we don't need key cache
142 * coherency for non-alloc btrees, and key cache fills for snapshots
143 * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
144 * the snapshots recovery pass runs.
146 if (!k
->level
&& k
->btree_id
== BTREE_ID_alloc
)
147 iter_flags
|= BTREE_ITER_CACHED
;
149 update_flags
|= BTREE_UPDATE_KEY_CACHE_RECLAIM
;
151 bch2_trans_node_iter_init(trans
, &iter
, k
->btree_id
, k
->k
->k
.p
,
152 BTREE_MAX_DEPTH
, k
->level
,
154 ret
= bch2_btree_iter_traverse(&iter
);
158 struct btree_path
*path
= btree_iter_path(trans
, &iter
);
159 if (unlikely(!btree_path_node(path
, k
->level
))) {
160 bch2_trans_iter_exit(trans
, &iter
);
161 bch2_trans_node_iter_init(trans
, &iter
, k
->btree_id
, k
->k
->k
.p
,
162 BTREE_MAX_DEPTH
, 0, iter_flags
);
163 ret
= bch2_btree_iter_traverse(&iter
) ?:
164 bch2_btree_increase_depth(trans
, iter
.path
, 0) ?:
165 -BCH_ERR_transaction_restart_nested
;
169 /* Must be checked with btree locked: */
173 ret
= bch2_trans_update(trans
, &iter
, k
->k
, update_flags
);
175 bch2_trans_iter_exit(trans
, &iter
);
179 static int journal_sort_seq_cmp(const void *_l
, const void *_r
)
181 const struct journal_key
*l
= *((const struct journal_key
**)_l
);
182 const struct journal_key
*r
= *((const struct journal_key
**)_r
);
184 return cmp_int(l
->journal_seq
, r
->journal_seq
);
187 int bch2_journal_replay(struct bch_fs
*c
)
189 struct journal_keys
*keys
= &c
->journal_keys
;
190 DARRAY(struct journal_key
*) keys_sorted
= { 0 };
191 struct journal
*j
= &c
->journal
;
192 u64 start_seq
= c
->journal_replay_seq_start
;
193 u64 end_seq
= c
->journal_replay_seq_start
;
194 struct btree_trans
*trans
= bch2_trans_get(c
);
195 bool immediate_flush
= false;
199 ret
= bch2_journal_log_msg(c
, "Starting journal replay (%zu keys in entries %llu-%llu)",
200 keys
->nr
, start_seq
, end_seq
);
205 BUG_ON(!atomic_read(&keys
->ref
));
207 move_gap(keys
, keys
->nr
);
210 * First, attempt to replay keys in sorted order. This is more
211 * efficient - better locality of btree access - but some might fail if
212 * that would cause a journal deadlock.
214 darray_for_each(*keys
, k
) {
218 * k->allocated means the key wasn't read in from the journal,
219 * rather it was from early repair code
222 immediate_flush
= true;
224 /* Skip fastpath if we're low on space in the journal */
225 ret
= c
->journal
.watermark
? -1 :
226 commit_do(trans
, NULL
, NULL
,
227 BCH_TRANS_COMMIT_no_enospc
|
228 BCH_TRANS_COMMIT_journal_reclaim
|
229 (!k
->allocated
? BCH_TRANS_COMMIT_no_journal_res
: 0),
230 bch2_journal_replay_key(trans
, k
));
231 BUG_ON(!ret
&& !k
->overwritten
);
233 ret
= darray_push(&keys_sorted
, k
);
240 * Now, replay any remaining keys in the order in which they appear in
241 * the journal, unpinning those journal entries as we go:
243 sort(keys_sorted
.data
, keys_sorted
.nr
,
244 sizeof(keys_sorted
.data
[0]),
245 journal_sort_seq_cmp
, NULL
);
247 darray_for_each(keys_sorted
, kp
) {
250 struct journal_key
*k
= *kp
;
252 replay_now_at(j
, k
->journal_seq
);
254 ret
= commit_do(trans
, NULL
, NULL
,
255 BCH_TRANS_COMMIT_no_enospc
|
257 ? BCH_TRANS_COMMIT_no_journal_res
|BCH_WATERMARK_reclaim
259 bch2_journal_replay_key(trans
, k
));
260 bch_err_msg(c
, ret
, "while replaying key at btree %s level %u:",
261 bch2_btree_id_str(k
->btree_id
), k
->level
);
265 BUG_ON(!k
->overwritten
);
269 * We need to put our btree_trans before calling flush_all_pins(), since
270 * that will use a btree_trans internally
272 bch2_trans_put(trans
);
275 if (!c
->opts
.retain_recovery_info
&&
276 c
->recovery_pass_done
>= BCH_RECOVERY_PASS_journal_replay
)
277 bch2_journal_keys_put_initial(c
);
279 replay_now_at(j
, j
->replay_journal_seq_end
);
280 j
->replay_journal_seq
= 0;
282 bch2_journal_set_replay_done(j
);
284 /* if we did any repair, flush it immediately */
285 if (immediate_flush
) {
286 bch2_journal_flush_all_pins(&c
->journal
);
287 ret
= bch2_journal_meta(&c
->journal
);
291 bch2_journal_log_msg(c
, "journal replay finished");
294 bch2_trans_put(trans
);
295 darray_exit(&keys_sorted
);
300 /* journal replay early: */
302 static int journal_replay_entry_early(struct bch_fs
*c
,
303 struct jset_entry
*entry
)
307 switch (entry
->type
) {
308 case BCH_JSET_ENTRY_btree_root
: {
309 struct btree_root
*r
;
311 while (entry
->btree_id
>= c
->btree_roots_extra
.nr
+ BTREE_ID_NR
) {
312 ret
= darray_push(&c
->btree_roots_extra
, (struct btree_root
) { NULL
});
317 r
= bch2_btree_id_root(c
, entry
->btree_id
);
320 r
->level
= entry
->level
;
321 bkey_copy(&r
->key
, (struct bkey_i
*) entry
->start
);
324 r
->error
= -BCH_ERR_btree_node_read_error
;
329 case BCH_JSET_ENTRY_usage
: {
330 struct jset_entry_usage
*u
=
331 container_of(entry
, struct jset_entry_usage
, entry
);
333 switch (entry
->btree_id
) {
334 case BCH_FS_USAGE_reserved
:
335 if (entry
->level
< BCH_REPLICAS_MAX
)
336 c
->usage_base
->persistent_reserved
[entry
->level
] =
339 case BCH_FS_USAGE_inodes
:
340 c
->usage_base
->b
.nr_inodes
= le64_to_cpu(u
->v
);
342 case BCH_FS_USAGE_key_version
:
343 atomic64_set(&c
->key_version
,
350 case BCH_JSET_ENTRY_data_usage
: {
351 struct jset_entry_data_usage
*u
=
352 container_of(entry
, struct jset_entry_data_usage
, entry
);
354 ret
= bch2_replicas_set_usage(c
, &u
->r
,
358 case BCH_JSET_ENTRY_dev_usage
: {
359 struct jset_entry_dev_usage
*u
=
360 container_of(entry
, struct jset_entry_dev_usage
, entry
);
361 struct bch_dev
*ca
= bch_dev_bkey_exists(c
, le32_to_cpu(u
->dev
));
362 unsigned i
, nr_types
= jset_entry_dev_usage_nr_types(u
);
364 for (i
= 0; i
< min_t(unsigned, nr_types
, BCH_DATA_NR
); i
++) {
365 ca
->usage_base
->d
[i
].buckets
= le64_to_cpu(u
->d
[i
].buckets
);
366 ca
->usage_base
->d
[i
].sectors
= le64_to_cpu(u
->d
[i
].sectors
);
367 ca
->usage_base
->d
[i
].fragmented
= le64_to_cpu(u
->d
[i
].fragmented
);
372 case BCH_JSET_ENTRY_blacklist
: {
373 struct jset_entry_blacklist
*bl_entry
=
374 container_of(entry
, struct jset_entry_blacklist
, entry
);
376 ret
= bch2_journal_seq_blacklist_add(c
,
377 le64_to_cpu(bl_entry
->seq
),
378 le64_to_cpu(bl_entry
->seq
) + 1);
381 case BCH_JSET_ENTRY_blacklist_v2
: {
382 struct jset_entry_blacklist_v2
*bl_entry
=
383 container_of(entry
, struct jset_entry_blacklist_v2
, entry
);
385 ret
= bch2_journal_seq_blacklist_add(c
,
386 le64_to_cpu(bl_entry
->start
),
387 le64_to_cpu(bl_entry
->end
) + 1);
390 case BCH_JSET_ENTRY_clock
: {
391 struct jset_entry_clock
*clock
=
392 container_of(entry
, struct jset_entry_clock
, entry
);
394 atomic64_set(&c
->io_clock
[clock
->rw
].now
, le64_to_cpu(clock
->time
));
401 static int journal_replay_early(struct bch_fs
*c
,
402 struct bch_sb_field_clean
*clean
)
405 for (struct jset_entry
*entry
= clean
->start
;
406 entry
!= vstruct_end(&clean
->field
);
407 entry
= vstruct_next(entry
)) {
408 int ret
= journal_replay_entry_early(c
, entry
);
413 struct genradix_iter iter
;
414 struct journal_replay
*i
, **_i
;
416 genradix_for_each(&c
->journal_entries
, iter
, _i
) {
419 if (journal_replay_ignore(i
))
422 vstruct_for_each(&i
->j
, entry
) {
423 int ret
= journal_replay_entry_early(c
, entry
);
430 bch2_fs_usage_initialize(c
);
435 /* sb clean section: */
437 static int read_btree_roots(struct bch_fs
*c
)
441 for (unsigned i
= 0; i
< btree_id_nr_alive(c
); i
++) {
442 struct btree_root
*r
= bch2_btree_id_root(c
, i
);
447 if (btree_id_is_alloc(i
) && c
->opts
.reconstruct_alloc
)
450 if (mustfix_fsck_err_on((ret
= r
->error
),
451 c
, btree_root_bkey_invalid
,
452 "invalid btree root %s",
453 bch2_btree_id_str(i
)) ||
454 mustfix_fsck_err_on((ret
= r
->error
= bch2_btree_root_read(c
, i
, &r
->key
, r
->level
)),
455 c
, btree_root_read_error
,
456 "error reading btree root %s l=%u: %s",
457 bch2_btree_id_str(i
), r
->level
, bch2_err_str(ret
))) {
458 if (btree_id_is_alloc(i
)) {
459 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_allocations
);
460 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info
);
461 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_lrus
);
462 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers
);
463 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs
);
464 c
->sb
.compat
&= ~(1ULL << BCH_COMPAT_alloc_info
);
466 } else if (!(c
->recovery_passes_explicit
& BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes
))) {
467 bch_info(c
, "will run btree node scan");
468 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes
);
469 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_topology
);
476 for (unsigned i
= 0; i
< BTREE_ID_NR
; i
++) {
477 struct btree_root
*r
= bch2_btree_id_root(c
, i
);
479 if (!r
->b
&& !r
->error
) {
482 bch2_btree_root_alloc_fake(c
, i
, 0);
489 static bool check_version_upgrade(struct bch_fs
*c
)
491 unsigned latest_version
= bcachefs_metadata_version_current
;
492 unsigned latest_compatible
= min(latest_version
,
493 bch2_latest_compatible_version(c
->sb
.version
));
494 unsigned old_version
= c
->sb
.version_upgrade_complete
?: c
->sb
.version
;
495 unsigned new_version
= 0;
497 if (old_version
< bcachefs_metadata_required_upgrade_below
) {
498 if (c
->opts
.version_upgrade
== BCH_VERSION_UPGRADE_incompatible
||
499 latest_compatible
< bcachefs_metadata_required_upgrade_below
)
500 new_version
= latest_version
;
502 new_version
= latest_compatible
;
504 switch (c
->opts
.version_upgrade
) {
505 case BCH_VERSION_UPGRADE_compatible
:
506 new_version
= latest_compatible
;
508 case BCH_VERSION_UPGRADE_incompatible
:
509 new_version
= latest_version
;
511 case BCH_VERSION_UPGRADE_none
:
512 new_version
= min(old_version
, latest_version
);
517 if (new_version
> old_version
) {
518 struct printbuf buf
= PRINTBUF
;
520 if (old_version
< bcachefs_metadata_required_upgrade_below
)
521 prt_str(&buf
, "Version upgrade required:\n");
523 if (old_version
!= c
->sb
.version
) {
524 prt_str(&buf
, "Version upgrade from ");
525 bch2_version_to_text(&buf
, c
->sb
.version_upgrade_complete
);
526 prt_str(&buf
, " to ");
527 bch2_version_to_text(&buf
, c
->sb
.version
);
528 prt_str(&buf
, " incomplete\n");
531 prt_printf(&buf
, "Doing %s version upgrade from ",
532 BCH_VERSION_MAJOR(old_version
) != BCH_VERSION_MAJOR(new_version
)
533 ? "incompatible" : "compatible");
534 bch2_version_to_text(&buf
, old_version
);
535 prt_str(&buf
, " to ");
536 bch2_version_to_text(&buf
, new_version
);
539 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(c
->disk_sb
.sb
, ext
);
540 __le64 passes
= ext
->recovery_passes_required
[0];
541 bch2_sb_set_upgrade(c
, old_version
, new_version
);
542 passes
= ext
->recovery_passes_required
[0] & ~passes
;
545 prt_str(&buf
, " running recovery passes: ");
546 prt_bitflags(&buf
, bch2_recovery_passes
,
547 bch2_recovery_passes_from_stable(le64_to_cpu(passes
)));
550 bch_info(c
, "%s", buf
.buf
);
552 bch2_sb_upgrade(c
, new_version
);
561 int bch2_fs_recovery(struct bch_fs
*c
)
563 struct bch_sb_field_clean
*clean
= NULL
;
564 struct jset
*last_journal_entry
= NULL
;
565 u64 last_seq
= 0, blacklist_seq
, journal_seq
;
569 clean
= bch2_read_superblock_clean(c
);
570 ret
= PTR_ERR_OR_ZERO(clean
);
574 bch_info(c
, "recovering from clean shutdown, journal seq %llu",
575 le64_to_cpu(clean
->journal_seq
));
577 bch_info(c
, "recovering from unclean shutdown");
580 if (!(c
->sb
.features
& (1ULL << BCH_FEATURE_new_extent_overwrite
))) {
581 bch_err(c
, "feature new_extent_overwrite not set, filesystem no longer supported");
587 !(c
->sb
.features
& (1ULL << BCH_FEATURE_extents_above_btree_updates
))) {
588 bch_err(c
, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
593 if (c
->opts
.norecovery
)
594 c
->opts
.recovery_pass_last
= BCH_RECOVERY_PASS_journal_replay
- 1;
596 if (!c
->opts
.nochanges
) {
597 mutex_lock(&c
->sb_lock
);
598 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(c
->disk_sb
.sb
, ext
);
599 bool write_sb
= false;
601 if (BCH_SB_HAS_TOPOLOGY_ERRORS(c
->disk_sb
.sb
)) {
602 ext
->recovery_passes_required
[0] |=
603 cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology
)));
607 u64 sb_passes
= bch2_recovery_passes_from_stable(le64_to_cpu(ext
->recovery_passes_required
[0]));
609 struct printbuf buf
= PRINTBUF
;
610 prt_str(&buf
, "superblock requires following recovery passes to be run:\n ");
611 prt_bitflags(&buf
, bch2_recovery_passes
, sb_passes
);
612 bch_info(c
, "%s", buf
.buf
);
616 if (bch2_check_version_downgrade(c
)) {
617 struct printbuf buf
= PRINTBUF
;
619 prt_str(&buf
, "Version downgrade required:");
621 __le64 passes
= ext
->recovery_passes_required
[0];
622 bch2_sb_set_downgrade(c
,
623 BCH_VERSION_MINOR(bcachefs_metadata_version_current
),
624 BCH_VERSION_MINOR(c
->sb
.version
));
625 passes
= ext
->recovery_passes_required
[0] & ~passes
;
627 prt_str(&buf
, "\n running recovery passes: ");
628 prt_bitflags(&buf
, bch2_recovery_passes
,
629 bch2_recovery_passes_from_stable(le64_to_cpu(passes
)));
632 bch_info(c
, "%s", buf
.buf
);
637 if (check_version_upgrade(c
))
643 c
->recovery_passes_explicit
|= bch2_recovery_passes_from_stable(le64_to_cpu(ext
->recovery_passes_required
[0]));
644 mutex_unlock(&c
->sb_lock
);
647 if (c
->opts
.fsck
&& IS_ENABLED(CONFIG_BCACHEFS_DEBUG
))
648 c
->recovery_passes_explicit
|= BIT_ULL(BCH_RECOVERY_PASS_check_topology
);
651 set_bit(BCH_FS_fsck_running
, &c
->flags
);
653 ret
= bch2_blacklist_table_initialize(c
);
655 bch_err(c
, "error initializing blacklist table");
659 if (!c
->sb
.clean
|| c
->opts
.fsck
|| c
->opts
.retain_recovery_info
) {
660 struct genradix_iter iter
;
661 struct journal_replay
**i
;
663 bch_verbose(c
, "starting journal read");
664 ret
= bch2_journal_read(c
, &last_seq
, &blacklist_seq
, &journal_seq
);
669 * note: cmd_list_journal needs the blacklist table fully up to date so
670 * it can asterisk ignored journal entries:
672 if (c
->opts
.read_journal_only
)
675 genradix_for_each_reverse(&c
->journal_entries
, iter
, i
)
676 if (!journal_replay_ignore(*i
)) {
677 last_journal_entry
= &(*i
)->j
;
681 if (mustfix_fsck_err_on(c
->sb
.clean
&&
682 last_journal_entry
&&
683 !journal_entry_empty(last_journal_entry
), c
,
684 clean_but_journal_not_empty
,
685 "filesystem marked clean but journal not empty")) {
686 c
->sb
.compat
&= ~(1ULL << BCH_COMPAT_alloc_info
);
687 SET_BCH_SB_CLEAN(c
->disk_sb
.sb
, false);
691 if (!last_journal_entry
) {
692 fsck_err_on(!c
->sb
.clean
, c
,
693 dirty_but_no_journal_entries
,
694 "no journal entries found");
698 genradix_for_each_reverse(&c
->journal_entries
, iter
, i
)
700 last_journal_entry
= &(*i
)->j
;
701 (*i
)->ignore_blacklisted
= false;
702 (*i
)->ignore_not_dirty
= false;
704 * This was probably a NO_FLUSH entry,
705 * so last_seq was garbage - but we know
706 * we're only using a single journal
707 * entry, set it here:
709 (*i
)->j
.last_seq
= (*i
)->j
.seq
;
714 ret
= bch2_journal_keys_sort(c
);
718 if (c
->sb
.clean
&& last_journal_entry
) {
719 ret
= bch2_verify_superblock_clean(c
, &clean
,
727 bch_err(c
, "no superblock clean section found");
728 ret
= -BCH_ERR_fsck_repair_impossible
;
732 blacklist_seq
= journal_seq
= le64_to_cpu(clean
->journal_seq
) + 1;
735 c
->journal_replay_seq_start
= last_seq
;
736 c
->journal_replay_seq_end
= blacklist_seq
- 1;
738 if (c
->opts
.reconstruct_alloc
)
739 bch2_reconstruct_alloc(c
);
741 zero_out_btree_mem_ptr(&c
->journal_keys
);
743 ret
= journal_replay_early(c
, clean
);
748 * After an unclean shutdown, skip then next few journal sequence
749 * numbers as they may have been referenced by btree writes that
750 * happened before their corresponding journal writes - those btree
751 * writes need to be ignored, by skipping and blacklisting the next few
752 * journal sequence numbers:
757 if (blacklist_seq
!= journal_seq
) {
758 ret
= bch2_journal_log_msg(c
, "blacklisting entries %llu-%llu",
759 blacklist_seq
, journal_seq
) ?:
760 bch2_journal_seq_blacklist_add(c
,
761 blacklist_seq
, journal_seq
);
763 bch_err_msg(c
, ret
, "error creating new journal seq blacklist entry");
768 ret
= bch2_journal_log_msg(c
, "starting journal at entry %llu, replaying %llu-%llu",
769 journal_seq
, last_seq
, blacklist_seq
- 1) ?:
770 bch2_fs_journal_start(&c
->journal
, journal_seq
);
775 * Skip past versions that might have possibly been used (as nonces),
776 * but hadn't had their pointers written:
778 if (c
->sb
.encryption_type
&& !c
->sb
.clean
)
779 atomic64_add(1 << 16, &c
->key_version
);
781 ret
= read_btree_roots(c
);
785 ret
= bch2_run_recovery_passes(c
);
789 clear_bit(BCH_FS_fsck_running
, &c
->flags
);
791 /* fsync if we fixed errors */
792 if (test_bit(BCH_FS_errors_fixed
, &c
->flags
)) {
793 bch2_journal_flush_all_pins(&c
->journal
);
794 bch2_journal_meta(&c
->journal
);
797 /* If we fixed errors, verify that fs is actually clean now: */
798 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG
) &&
799 test_bit(BCH_FS_errors_fixed
, &c
->flags
) &&
800 !test_bit(BCH_FS_errors_not_fixed
, &c
->flags
) &&
801 !test_bit(BCH_FS_error
, &c
->flags
)) {
802 bch2_flush_fsck_errs(c
);
804 bch_info(c
, "Fixed errors, running fsck a second time to verify fs is clean");
805 clear_bit(BCH_FS_errors_fixed
, &c
->flags
);
807 c
->curr_recovery_pass
= BCH_RECOVERY_PASS_check_alloc_info
;
809 ret
= bch2_run_recovery_passes(c
);
813 if (test_bit(BCH_FS_errors_fixed
, &c
->flags
) ||
814 test_bit(BCH_FS_errors_not_fixed
, &c
->flags
)) {
815 bch_err(c
, "Second fsck run was not clean");
816 set_bit(BCH_FS_errors_not_fixed
, &c
->flags
);
819 set_bit(BCH_FS_errors_fixed
, &c
->flags
);
822 if (enabled_qtypes(c
)) {
823 bch_verbose(c
, "reading quotas");
824 ret
= bch2_fs_quota_read(c
);
827 bch_verbose(c
, "quotas done");
830 mutex_lock(&c
->sb_lock
);
831 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(c
->disk_sb
.sb
, ext
);
832 bool write_sb
= false;
834 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
) != le16_to_cpu(c
->disk_sb
.sb
->version
)) {
835 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
, le16_to_cpu(c
->disk_sb
.sb
->version
));
839 if (!test_bit(BCH_FS_error
, &c
->flags
) &&
840 !(c
->disk_sb
.sb
->compat
[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info
))) {
841 c
->disk_sb
.sb
->compat
[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info
);
845 if (!test_bit(BCH_FS_error
, &c
->flags
) &&
846 !bch2_is_zero(ext
->errors_silent
, sizeof(ext
->errors_silent
))) {
847 memset(ext
->errors_silent
, 0, sizeof(ext
->errors_silent
));
852 !test_bit(BCH_FS_error
, &c
->flags
) &&
853 !test_bit(BCH_FS_errors_not_fixed
, &c
->flags
)) {
854 SET_BCH_SB_HAS_ERRORS(c
->disk_sb
.sb
, 0);
855 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c
->disk_sb
.sb
, 0);
861 mutex_unlock(&c
->sb_lock
);
863 if (!(c
->sb
.compat
& (1ULL << BCH_COMPAT_extents_above_btree_updates_done
)) ||
864 c
->sb
.version_min
< bcachefs_metadata_version_btree_ptr_sectors_written
) {
865 struct bch_move_stats stats
;
867 bch2_move_stats_init(&stats
, "recovery");
869 struct printbuf buf
= PRINTBUF
;
870 bch2_version_to_text(&buf
, c
->sb
.version_min
);
871 bch_info(c
, "scanning for old btree nodes: min_version %s", buf
.buf
);
874 ret
= bch2_fs_read_write_early(c
) ?:
875 bch2_scan_old_btree_nodes(c
, &stats
);
878 bch_info(c
, "scanning for old btree nodes done");
881 if (c
->journal_seq_blacklist_table
&&
882 c
->journal_seq_blacklist_table
->nr
> 128)
883 queue_work(system_long_wq
, &c
->journal_seq_blacklist_gc_work
);
887 bch2_flush_fsck_errs(c
);
889 if (!c
->opts
.retain_recovery_info
) {
890 bch2_journal_keys_put_initial(c
);
891 bch2_find_btree_nodes_exit(&c
->found_btree_nodes
);
896 test_bit(BCH_FS_need_delete_dead_snapshots
, &c
->flags
) &&
897 !c
->opts
.nochanges
) {
898 bch2_fs_read_write_early(c
);
899 bch2_delete_dead_snapshots_async(c
);
906 bch2_fs_emergency_read_only(c
);
910 int bch2_fs_initialize(struct bch_fs
*c
)
912 struct bch_inode_unpacked root_inode
, lostfound_inode
;
913 struct bkey_inode_buf packed_inode
;
914 struct qstr lostfound
= QSTR("lost+found");
917 bch_notice(c
, "initializing new filesystem");
919 mutex_lock(&c
->sb_lock
);
920 c
->disk_sb
.sb
->compat
[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done
);
921 c
->disk_sb
.sb
->compat
[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done
);
923 bch2_check_version_downgrade(c
);
925 if (c
->opts
.version_upgrade
!= BCH_VERSION_UPGRADE_none
) {
926 bch2_sb_upgrade(c
, bcachefs_metadata_version_current
);
927 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
, bcachefs_metadata_version_current
);
930 mutex_unlock(&c
->sb_lock
);
932 c
->curr_recovery_pass
= BCH_RECOVERY_PASS_NR
;
933 set_bit(BCH_FS_may_go_rw
, &c
->flags
);
935 for (unsigned i
= 0; i
< BTREE_ID_NR
; i
++)
936 bch2_btree_root_alloc_fake(c
, i
, 0);
938 for_each_member_device(c
, ca
)
939 bch2_dev_usage_init(ca
);
941 ret
= bch2_fs_journal_alloc(c
);
946 * journal_res_get() will crash if called before this has
947 * set up the journal.pin FIFO and journal.cur pointer:
949 bch2_fs_journal_start(&c
->journal
, 1);
950 bch2_journal_set_replay_done(&c
->journal
);
952 ret
= bch2_fs_read_write_early(c
);
957 * Write out the superblock and journal buckets, now that we can do
960 bch_verbose(c
, "marking superblocks");
961 ret
= bch2_trans_mark_dev_sbs(c
);
962 bch_err_msg(c
, ret
, "marking superblocks");
966 for_each_online_member(c
, ca
)
967 ca
->new_fs_bucket_idx
= 0;
969 ret
= bch2_fs_freespace_init(c
);
973 ret
= bch2_initialize_subvolumes(c
);
977 bch_verbose(c
, "reading snapshots table");
978 ret
= bch2_snapshots_read(c
);
981 bch_verbose(c
, "reading snapshots done");
983 bch2_inode_init(c
, &root_inode
, 0, 0, S_IFDIR
|0755, 0, NULL
);
984 root_inode
.bi_inum
= BCACHEFS_ROOT_INO
;
985 root_inode
.bi_subvol
= BCACHEFS_ROOT_SUBVOL
;
986 bch2_inode_pack(&packed_inode
, &root_inode
);
987 packed_inode
.inode
.k
.p
.snapshot
= U32_MAX
;
989 ret
= bch2_btree_insert(c
, BTREE_ID_inodes
, &packed_inode
.inode
.k_i
, NULL
, 0);
990 bch_err_msg(c
, ret
, "creating root directory");
994 bch2_inode_init_early(c
, &lostfound_inode
);
996 ret
= bch2_trans_do(c
, NULL
, NULL
, 0,
997 bch2_create_trans(trans
,
998 BCACHEFS_ROOT_SUBVOL_INUM
,
999 &root_inode
, &lostfound_inode
,
1001 0, 0, S_IFDIR
|0700, 0,
1002 NULL
, NULL
, (subvol_inum
) { 0 }, 0));
1003 bch_err_msg(c
, ret
, "creating lost+found");
1007 c
->recovery_pass_done
= BCH_RECOVERY_PASS_NR
- 1;
1009 if (enabled_qtypes(c
)) {
1010 ret
= bch2_fs_quota_read(c
);
1015 ret
= bch2_journal_flush(&c
->journal
);
1016 bch_err_msg(c
, ret
, "writing first journal entry");
1020 mutex_lock(&c
->sb_lock
);
1021 SET_BCH_SB_INITIALIZED(c
->disk_sb
.sb
, true);
1022 SET_BCH_SB_CLEAN(c
->disk_sb
.sb
, false);
1024 bch2_write_super(c
);
1025 mutex_unlock(&c
->sb_lock
);