1 // SPDX-License-Identifier: GPL-2.0
5 #include "disk_groups.h"
9 #include "journal_sb.h"
10 #include "journal_seq_blacklist.h"
15 #include "sb-counters.h"
16 #include "sb-downgrade.h"
17 #include "sb-errors.h"
18 #include "sb-members.h"
24 #include <linux/backing-dev.h>
25 #include <linux/sort.h>
/*
 * Block-device holder ops handed to bdev_file_open_by_path() when
 * __bch2_read_super() opens the device. The initializer body is not
 * visible in this listing.
 */
27 static const struct blk_holder_ops bch2_sb_handle_bdev_ops
= {
/*
 * One row of the metadata version table. bch2_version_to_text() reads its
 * .version and .name members; the member declarations are not visible here.
 */
30 struct bch2_metadata_version
{
/* Table of known metadata versions, filled in by BCH_METADATA_VERSIONS(). */
35 static const struct bch2_metadata_version bch2_metadata_versions
[] = {
40 BCH_METADATA_VERSIONS()
/*
 * Print metadata version @v to @out as "major.minor: name".
 *
 * The name comes from the bch2_metadata_versions entry whose .version
 * equals @v; if no entry matches, "(unknown version)" is printed instead.
 */
44 void bch2_version_to_text(struct printbuf
*out
, unsigned v
)
46 const char *str
= "(unknown version)";
48 for (unsigned i
= 0; i
< ARRAY_SIZE(bch2_metadata_versions
); i
++)
49 if (bch2_metadata_versions
[i
].version
== v
) {
50 str
= bch2_metadata_versions
[i
].name
;
54 prt_printf(out
, "%u.%u: %s", BCH_VERSION_MAJOR(v
), BCH_VERSION_MINOR(v
), str
);
/*
 * Starting from version @v, walk bch2_metadata_versions and advance v to
 * each entry whose version is greater than the current v and whose major
 * number passes an equality test.
 *
 * NOTE(review): the right-hand side of that major-number test and the
 * return statements are on lines missing from this listing; presumably the
 * comparison is against BCH_VERSION_MAJOR(v) - confirm. A @v whose major
 * number is zero takes an early branch whose body is likewise not shown.
 */
57 unsigned bch2_latest_compatible_version(unsigned v
)
59 if (!BCH_VERSION_MAJOR(v
))
62 for (unsigned i
= 0; i
< ARRAY_SIZE(bch2_metadata_versions
); i
++)
63 if (bch2_metadata_versions
[i
].version
> v
&&
64 BCH_VERSION_MAJOR(bch2_metadata_versions
[i
].version
) ==
66 v
= bch2_metadata_versions
[i
].version
;
/*
 * Superblock field names as strings, generated by an x-macro that
 * stringifies each field's name. Indexed by field type elsewhere in this
 * file (see bch2_sb_field_to_text()).
 */
71 const char * const bch2_sb_fields
[] = {
72 #define x(name, nr) #name,
78 static int bch2_sb_field_validate(struct bch_sb
*, struct bch_sb_field
*,
/*
 * Find the optional superblock field of the given @type in @sb.
 *
 * Walks the fields with vstruct_for_each() and compares each field's
 * little-endian type against @type. The return statements are not visible
 * in this listing; callers in this file treat a NULL result as "field not
 * present".
 */
81 struct bch_sb_field
*bch2_sb_field_get_id(struct bch_sb
*sb
,
82 enum bch_sb_field_type type
)
84 /* XXX: need locking around superblock to access optional fields */
86 vstruct_for_each(sb
, f
)
87 if (le32_to_cpu(f
->type
) == type
)
/*
 * Resize field @f inside the superblock buffer owned by @sb so that it is
 * u64s 64-bit words long, shifting the fields that follow it.
 *
 * The new total superblock size is the current size plus the size delta.
 * The buffer must already be large enough for it: this function BUG()s
 * otherwise, so callers grow the buffer first (bch2_sb_realloc()).
 *
 * Two paths are visible (their selecting conditions are on lines missing
 * from this listing):
 *  - a field is placed at vstruct_last(), zeroed, and given its size;
 *  - an existing field has its size rewritten, the data after it is moved
 *    with memmove() from the old end to the new end, and the bytes between
 *    the old and new end are zeroed.
 *
 * Returns the field, or NULL when the requested size is zero.
 */
92 static struct bch_sb_field
*__bch2_sb_field_resize(struct bch_sb_handle
*sb
,
93 struct bch_sb_field
*f
,
96 unsigned old_u64s
= f
? le32_to_cpu(f
->u64s
) : 0;
97 unsigned sb_u64s
= le32_to_cpu(sb
->sb
->u64s
) + u64s
- old_u64s
;
/* Caller is responsible for having made the buffer big enough. */
99 BUG_ON(__vstruct_bytes(struct bch_sb
, sb_u64s
) > sb
->buffer_size
);
104 f
= vstruct_last(sb
->sb
);
105 memset(f
, 0, sizeof(u64
) * u64s
);
106 f
->u64s
= cpu_to_le32(u64s
);
/* src: end of the field at its old size; dst: end at its new size. */
111 src
= vstruct_end(f
);
114 f
->u64s
= cpu_to_le32(u64s
);
115 dst
= vstruct_end(f
);
120 memmove(dst
, src
, vstruct_end(sb
->sb
) - src
);
123 memset(src
, 0, dst
- src
);
126 sb
->sb
->u64s
= cpu_to_le32(sb_u64s
);
128 return u64s
? f
: NULL
;
/*
 * Remove the field of the given @type from @sb by resizing it to zero
 * 64-bit words. Any guard between the lookup and the resize is not
 * visible in this listing.
 */
131 void bch2_sb_field_delete(struct bch_sb_handle
*sb
,
132 enum bch_sb_field_type type
)
134 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
137 __bch2_sb_field_resize(sb
, f
, 0);
140 /* Superblock realloc/free: */
/*
 * Release the resources held by superblock handle @sb and reset it.
 *
 * Visible here: the block-device file reference is dropped with fput()
 * when s_bdev_file is neither NULL nor an error pointer, and the whole
 * handle is zeroed at the end. Other release steps sit on lines missing
 * from this listing.
 */
142 void bch2_free_super(struct bch_sb_handle
*sb
)
145 if (!IS_ERR_OR_NULL(sb
->s_bdev_file
))
146 fput(sb
->s_bdev_file
);
151 memset(sb
, 0, sizeof(*sb
));
/*
 * Make sure @sb's buffer can hold a superblock of @u64s 64-bit words.
 *
 * The byte size is raised to at least the device's logical block size and
 * then rounded up to a power of two. If a buffer of at least that size
 * already exists, an early branch is taken (its body is not shown).
 *
 * When the on-disk layout is known (sb->have_layout), a request larger
 * than 512 << layout.sb_max_size_bits is logged and rejected with
 * -BCH_ERR_ENOSPC_sb.
 *
 * Other error codes visible here:
 *   -BCH_ERR_ENOMEM_sb_realloc_injected  dynamic fault injection point
 *   -BCH_ERR_ENOMEM_sb_buf_realloc       associated with the krealloc() step
 *   -BCH_ERR_ENOMEM_sb_bio_realloc       associated with the bio_kmalloc() step
 *
 * The buffer is (re)allocated with krealloc(GFP_NOFS|__GFP_ZERO), a bio
 * with enough bvecs for the new buffer is allocated and initialised, and
 * sb->buffer_size is updated.
 *
 * NOTE(review): "512 << sb_max_size_bits" is evaluated in int before being
 * stored in a u64; confirm sb_max_size_bits is range-checked beforehand.
 */
154 int bch2_sb_realloc(struct bch_sb_handle
*sb
, unsigned u64s
)
156 size_t new_bytes
= __vstruct_bytes(struct bch_sb
, u64s
);
157 size_t new_buffer_size
;
158 struct bch_sb
*new_sb
;
162 new_bytes
= max_t(size_t, new_bytes
, bdev_logical_block_size(sb
->bdev
));
164 new_buffer_size
= roundup_pow_of_two(new_bytes
);
166 if (sb
->sb
&& sb
->buffer_size
>= new_buffer_size
)
169 if (sb
->sb
&& sb
->have_layout
) {
170 u64 max_bytes
= 512 << sb
->sb
->layout
.sb_max_size_bits
;
172 if (new_bytes
> max_bytes
) {
173 struct printbuf buf
= PRINTBUF
;
175 prt_bdevname(&buf
, sb
->bdev
);
176 prt_printf(&buf
, ": superblock too big: want %zu but have %llu", new_bytes
, max_bytes
);
177 pr_err("%s", buf
.buf
);
179 return -BCH_ERR_ENOSPC_sb
;
183 if (sb
->buffer_size
>= new_buffer_size
&& sb
->sb
)
186 if (dynamic_fault("bcachefs:add:super_realloc"))
187 return -BCH_ERR_ENOMEM_sb_realloc_injected
;
189 new_sb
= krealloc(sb
->sb
, new_buffer_size
, GFP_NOFS
|__GFP_ZERO
);
191 return -BCH_ERR_ENOMEM_sb_buf_realloc
;
196 unsigned nr_bvecs
= buf_pages(sb
->sb
, new_buffer_size
);
198 bio
= bio_kmalloc(nr_bvecs
, GFP_KERNEL
);
200 return -BCH_ERR_ENOMEM_sb_bio_realloc
;
202 bio_init(bio
, NULL
, bio
->bi_inline_vecs
, nr_bvecs
, 0);
208 sb
->buffer_size
= new_buffer_size
;
/*
 * Resize (or create) the field of the given @type in @sb to u64s 64-bit
 * words, growing buffers first.
 *
 * d is the signed size change. @sb's own buffer is reallocated for the
 * new total. In a branch whose condition is not visible, @sb is taken to
 * be the disk_sb embedded in a struct bch_fs: sb_lock must be held, and
 * every online member device's superblock buffer is grown by the same
 * delta, dropping the member reference if that fails.
 *
 * The field is looked up again after the reallocations (presumably because
 * krealloc() may have moved the buffer - confirm), resized, and stamped
 * with @type.
 */
213 struct bch_sb_field
*bch2_sb_field_resize_id(struct bch_sb_handle
*sb
,
214 enum bch_sb_field_type type
,
217 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
218 ssize_t old_u64s
= f
? le32_to_cpu(f
->u64s
) : 0;
219 ssize_t d
= -old_u64s
+ u64s
;
221 if (bch2_sb_realloc(sb
, le32_to_cpu(sb
->sb
->u64s
) + d
))
225 struct bch_fs
*c
= container_of(sb
, struct bch_fs
, disk_sb
);
227 lockdep_assert_held(&c
->sb_lock
);
229 /* XXX: we're not checking that offline devices have enough space */
231 for_each_online_member(c
, ca
) {
232 struct bch_sb_handle
*dev_sb
= &ca
->disk_sb
;
234 if (bch2_sb_realloc(dev_sb
, le32_to_cpu(dev_sb
->sb
->u64s
) + d
)) {
235 percpu_ref_put(&ca
->ref
);
241 f
= bch2_sb_field_get_id(sb
->sb
, type
);
242 f
= __bch2_sb_field_resize(sb
, f
, u64s
);
244 f
->type
= cpu_to_le32(type
);
/*
 * Look up the field of the given @type in @sb and make sure it is at least
 * u64s 64-bit words long: if the field is missing or smaller than that, it
 * is created or grown via bch2_sb_field_resize_id(). The return statement
 * is not visible in this listing.
 */
248 struct bch_sb_field
*bch2_sb_field_get_minsize_id(struct bch_sb_handle
*sb
,
249 enum bch_sb_field_type type
,
252 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
254 if (!f
|| le32_to_cpu(f
->u64s
) < u64s
)
255 f
= bch2_sb_field_resize_id(sb
, type
, u64s
);
259 /* Superblock validate: */
/*
 * Sanity-check a superblock layout, writing a human-readable reason to
 * @out on failure.
 *
 * Checks, in order:
 *  - magic matches BCACHE_MAGIC or BCHFS_MAGIC  (-BCH_ERR_invalid_sb_layout)
 *  - layout_type is 0                     (-BCH_ERR_invalid_sb_layout_type)
 *  - nr_superblocks is non-zero and does not exceed the sb_offset array
 *                                (-BCH_ERR_invalid_sb_layout_nr_superblocks)
 *  - each superblock offset is at least max_sectors past the previous one,
 *    where max_sectors = 1 << sb_max_size_bits
 *                           (-BCH_ERR_invalid_sb_layout_superblocks_overlap)
 *
 * The overlap test compares each offset with its predecessor only, so it
 * relies on the offsets being stored in ascending order.
 *
 * NOTE(review): "1 << layout->sb_max_size_bits" is an int shift and no
 * upper-bound check on sb_max_size_bits is visible in this listing -
 * confirm it is validated on one of the missing lines or by the caller.
 */
261 static int validate_sb_layout(struct bch_sb_layout
*layout
, struct printbuf
*out
)
263 u64 offset
, prev_offset
, max_sectors
;
266 BUILD_BUG_ON(sizeof(struct bch_sb_layout
) != 512);
268 if (!uuid_equal(&layout
->magic
, &BCACHE_MAGIC
) &&
269 !uuid_equal(&layout
->magic
, &BCHFS_MAGIC
)) {
270 prt_printf(out
, "Not a bcachefs superblock layout");
271 return -BCH_ERR_invalid_sb_layout
;
274 if (layout
->layout_type
!= 0) {
275 prt_printf(out
, "Invalid superblock layout type %u",
276 layout
->layout_type
);
277 return -BCH_ERR_invalid_sb_layout_type
;
280 if (!layout
->nr_superblocks
) {
281 prt_printf(out
, "Invalid superblock layout: no superblocks");
282 return -BCH_ERR_invalid_sb_layout_nr_superblocks
;
285 if (layout
->nr_superblocks
> ARRAY_SIZE(layout
->sb_offset
)) {
286 prt_printf(out
, "Invalid superblock layout: too many superblocks");
287 return -BCH_ERR_invalid_sb_layout_nr_superblocks
;
290 max_sectors
= 1 << layout
->sb_max_size_bits
;
292 prev_offset
= le64_to_cpu(layout
->sb_offset
[0]);
294 for (i
= 1; i
< layout
->nr_superblocks
; i
++) {
295 offset
= le64_to_cpu(layout
->sb_offset
[i
]);
297 if (offset
< prev_offset
+ max_sectors
) {
298 prt_printf(out
, "Invalid superblock layout: superblocks overlap\n"
299 " (sb %u ends at %llu next starts at %llu",
300 i
- 1, prev_offset
+ max_sectors
, offset
);
301 return -BCH_ERR_invalid_sb_layout_superblocks_overlap
;
303 prev_offset
= offset
;
/*
 * Check that the superblock's version fields are ones this code accepts.
 *
 * Fails with -BCH_ERR_invalid_sb_version, after describing the problem in
 * @out, when:
 *  - sb->version is rejected by bch2_version_compatible();
 *  - sb->version_min is rejected by bch2_version_compatible();
 *  - version_min is greater than version.
 *
 * For the first two cases the message also prints the supported range
 * (bcachefs_metadata_version_min .. bcachefs_metadata_version_current).
 */
309 static int bch2_sb_compatible(struct bch_sb
*sb
, struct printbuf
*out
)
311 u16 version
= le16_to_cpu(sb
->version
);
312 u16 version_min
= le16_to_cpu(sb
->version_min
);
314 if (!bch2_version_compatible(version
)) {
315 prt_str(out
, "Unsupported superblock version ");
316 bch2_version_to_text(out
, version
);
317 prt_str(out
, " (min ");
318 bch2_version_to_text(out
, bcachefs_metadata_version_min
);
319 prt_str(out
, ", max ");
320 bch2_version_to_text(out
, bcachefs_metadata_version_current
);
322 return -BCH_ERR_invalid_sb_version
;
325 if (!bch2_version_compatible(version_min
)) {
326 prt_str(out
, "Unsupported superblock version_min ");
327 bch2_version_to_text(out
, version_min
);
328 prt_str(out
, " (min ");
329 bch2_version_to_text(out
, bcachefs_metadata_version_min
);
330 prt_str(out
, ", max ");
331 bch2_version_to_text(out
, bcachefs_metadata_version_current
);
333 return -BCH_ERR_invalid_sb_version
;
336 if (version_min
> version
) {
337 prt_str(out
, "Bad minimum version ");
338 bch2_version_to_text(out
, version_min
);
339 prt_str(out
, ", greater than version field ");
340 bch2_version_to_text(out
, version
);
341 return -BCH_ERR_invalid_sb_version
;
/*
 * Validate the superblock held by @disk_sb, describing the first problem
 * found in @out and returning a -BCH_ERR_invalid_sb_* code.
 *
 * The third parameter is declared on a line missing from this listing;
 * callers in this file pass READ or WRITE for it.
 *
 * Checks visible here, in order:
 *  - version compatibility (bch2_sb_compatible())
 *  - no feature bits outside the known range
 *  - block size not larger than PAGE_SECTORS
 *  - user and internal UUIDs are non-zero
 *  - nr_devices in 1..BCH_SB_MEMBERS_MAX and dev_idx < nr_devices
 *  - time_precision in 1..NSEC_PER_SEC
 *  - every option stored in the superblock passes bch2_opt_validate()
 *  - the layout passes validate_sb_layout()
 *  - every optional field has a valid size and stays inside the superblock
 *  - the members_v1 field exists and validates, then all other fields
 *  - the member entry for this device carries the superblock's seq
 *
 * Besides validating, zero journal flush/reclaim delays are replaced with
 * 1000 and a zero "version upgrade complete" is set to sb->version; the
 * condition enclosing those fix-ups is not visible here.
 */
347 static int bch2_sb_validate(struct bch_sb_handle
*disk_sb
, struct printbuf
*out
,
350 struct bch_sb
*sb
= disk_sb
->sb
;
351 struct bch_sb_field_members_v1
*mi
;
352 enum bch_opt_id opt_id
;
356 ret
= bch2_sb_compatible(sb
, out
);
/* features[1] must be empty; features[0] may only use known bits. */
360 if (sb
->features
[1] ||
361 (le64_to_cpu(sb
->features
[0]) & (~0ULL << BCH_FEATURE_NR
))) {
362 prt_printf(out
, "Filesystem has incompatible features");
363 return -BCH_ERR_invalid_sb_features
;
366 block_size
= le16_to_cpu(sb
->block_size
);
368 if (block_size
> PAGE_SECTORS
) {
369 prt_printf(out
, "Block size too big (got %u, max %u)",
370 block_size
, PAGE_SECTORS
);
371 return -BCH_ERR_invalid_sb_block_size
;
374 if (bch2_is_zero(sb
->user_uuid
.b
, sizeof(sb
->user_uuid
))) {
375 prt_printf(out
, "Bad user UUID (got zeroes)");
376 return -BCH_ERR_invalid_sb_uuid
;
379 if (bch2_is_zero(sb
->uuid
.b
, sizeof(sb
->uuid
))) {
380 prt_printf(out
, "Bad internal UUID (got zeroes)");
381 return -BCH_ERR_invalid_sb_uuid
;
384 if (!sb
->nr_devices
||
385 sb
->nr_devices
> BCH_SB_MEMBERS_MAX
) {
386 prt_printf(out
, "Bad number of member devices %u (max %u)",
387 sb
->nr_devices
, BCH_SB_MEMBERS_MAX
);
388 return -BCH_ERR_invalid_sb_too_many_members
;
391 if (sb
->dev_idx
>= sb
->nr_devices
) {
392 prt_printf(out
, "Bad dev_idx (got %u, nr_devices %u)",
393 sb
->dev_idx
, sb
->nr_devices
);
394 return -BCH_ERR_invalid_sb_dev_idx
;
397 if (!sb
->time_precision
||
398 le32_to_cpu(sb
->time_precision
) > NSEC_PER_SEC
) {
399 prt_printf(out
, "Invalid time precision: %u (min 1, max %lu)",
400 le32_to_cpu(sb
->time_precision
), NSEC_PER_SEC
);
401 return -BCH_ERR_invalid_sb_time_precision
;
406 * Been seeing a bug where these are getting inexplicably
407 * zeroed, so we're now validating them, but we have to be
408 * careful not to preven people's filesystems from mounting:
410 if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb
))
411 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb
, 1000);
412 if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb
))
413 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb
, 1000);
415 if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb
))
416 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb
, le16_to_cpu(sb
->version
));
/* Validate each option that has a superblock representation. */
419 for (opt_id
= 0; opt_id
< bch2_opts_nr
; opt_id
++) {
420 const struct bch_option
*opt
= bch2_opt_table
+ opt_id
;
422 if (opt
->get_sb
!= BCH2_NO_SB_OPT
) {
423 u64 v
= bch2_opt_from_sb(sb
, opt_id
);
425 prt_printf(out
, "Invalid option ");
426 ret
= bch2_opt_validate(opt
, v
, out
);
434 /* validate layout */
435 ret
= validate_sb_layout(&sb
->layout
, out
);
439 vstruct_for_each(sb
, f
) {
441 prt_printf(out
, "Invalid superblock: optional field with size 0 (type %u)",
442 le32_to_cpu(f
->type
));
443 return -BCH_ERR_invalid_sb_field_size
;
446 if (vstruct_next(f
) > vstruct_last(sb
)) {
447 prt_printf(out
, "Invalid superblock: optional field extends past end of superblock (type %u)",
448 le32_to_cpu(f
->type
));
449 return -BCH_ERR_invalid_sb_field_size
;
453 /* members must be validated first: */
454 mi
= bch2_sb_field_get(sb
, members_v1
);
456 prt_printf(out
, "Invalid superblock: member info area missing");
457 return -BCH_ERR_invalid_sb_members_missing
;
460 ret
= bch2_sb_field_validate(sb
, &mi
->field
, out
);
/* Remaining fields; members_v1 is singled out since it was done above. */
464 vstruct_for_each(sb
, f
) {
465 if (le32_to_cpu(f
->type
) == BCH_SB_FIELD_members_v1
)
468 ret
= bch2_sb_field_validate(sb
, f
, out
);
474 bch2_sb_member_get(sb
, sb
->dev_idx
).seq
!= sb
->seq
) {
475 prt_printf(out
, "Invalid superblock: member seq %llu != sb seq %llu",
476 le64_to_cpu(bch2_sb_member_get(sb
, sb
->dev_idx
).seq
),
477 le64_to_cpu(sb
->seq
));
478 return -BCH_ERR_invalid_sb_members_missing
;
/*
 * Convert one unsigned long, choosing the conversion by whether unsigned
 * long is 8 bytes wide. The two alternatives are on lines missing from
 * this listing; presumably little-endian 64-bit vs 32-bit to CPU order -
 * confirm.
 */
486 static unsigned long le_ulong_to_cpu(unsigned long v
)
488 return sizeof(unsigned long) == 8
/*
 * Convert a bit vector of @nr bits from @src into @dst, one unsigned long
 * at a time via le_ulong_to_cpu().
 *
 * @nr must be a whole number of longs (a multiple of BITS_PER_TYPE(long));
 * anything else trips the BUG_ON().
 */
493 static void le_bitvector_to_cpu(unsigned long *dst
, unsigned long *src
, unsigned nr
)
495 BUG_ON(nr
& (BITS_PER_TYPE(long) - 1));
497 for (unsigned i
= 0; i
< BITS_TO_LONGS(nr
); i
++)
498 dst
[i
] = le_ulong_to_cpu(src
[i
]);
/*
 * Refresh the in-memory summary c->sb from the filesystem's superblock
 * buffer (c->disk_sb.sb). Must be called with c->sb_lock held.
 *
 * Copies UUIDs, versions, device count, clean flag, encryption type, time
 * parameters and the first features/compat words, converting from
 * little-endian where the source uses le*_to_cpu().
 *
 * errors_silent is cleared and then loaded from the "ext" field (any NULL
 * check on that field is not visible in this listing). Finally each member
 * device's cached member info (ca->mi) is rebuilt from the superblock.
 */
501 static void bch2_sb_update(struct bch_fs
*c
)
503 struct bch_sb
*src
= c
->disk_sb
.sb
;
505 lockdep_assert_held(&c
->sb_lock
);
507 c
->sb
.uuid
= src
->uuid
;
508 c
->sb
.user_uuid
= src
->user_uuid
;
509 c
->sb
.version
= le16_to_cpu(src
->version
);
510 c
->sb
.version_min
= le16_to_cpu(src
->version_min
);
511 c
->sb
.version_upgrade_complete
= BCH_SB_VERSION_UPGRADE_COMPLETE(src
);
512 c
->sb
.nr_devices
= src
->nr_devices
;
513 c
->sb
.clean
= BCH_SB_CLEAN(src
);
514 c
->sb
.encryption_type
= BCH_SB_ENCRYPTION_TYPE(src
);
516 c
->sb
.nsec_per_time_unit
= le32_to_cpu(src
->time_precision
);
517 c
->sb
.time_units_per_sec
= NSEC_PER_SEC
/ c
->sb
.nsec_per_time_unit
;
519 /* XXX this is wrong, we need a 96 or 128 bit integer type */
520 c
->sb
.time_base_lo
= div_u64(le64_to_cpu(src
->time_base_lo
),
521 c
->sb
.nsec_per_time_unit
);
522 c
->sb
.time_base_hi
= le32_to_cpu(src
->time_base_hi
);
524 c
->sb
.features
= le64_to_cpu(src
->features
[0]);
525 c
->sb
.compat
= le64_to_cpu(src
->compat
[0]);
527 memset(c
->sb
.errors_silent
, 0, sizeof(c
->sb
.errors_silent
));
529 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(src
, ext
);
531 le_bitvector_to_cpu(c
->sb
.errors_silent
, (void *) ext
->errors_silent
,
532 sizeof(c
->sb
.errors_silent
) * 8);
534 for_each_member_device(c
, ca
) {
535 struct bch_member m
= bch2_sb_member_get(src
, ca
->dev_idx
);
536 ca
->mi
= bch2_mi_to_cpu(&m
);
/*
 * Copy the shared parts of superblock @src into the buffer owned by
 * @dst_handle.
 *
 * First the fixed header members are copied (versions, UUIDs, label, block
 * size, device count, time fields, flags, features, compat). Then, for
 * every field type that is not in BCH_SINGLE_DEVICE_SB_FIELDS, the
 * destination field is resized to the source field's size and overwritten
 * with the source field's bytes.
 *
 * When a field changes size the destination buffer is reallocated first,
 * after which dst and dst_f are re-read from the handle (the buffer may
 * have been replaced by the reallocation). The conditions around the
 * realloc and memcpy, and the return statements, are on lines missing from
 * this listing.
 */
540 static int __copy_super(struct bch_sb_handle
*dst_handle
, struct bch_sb
*src
)
542 struct bch_sb_field
*src_f
, *dst_f
;
543 struct bch_sb
*dst
= dst_handle
->sb
;
546 dst
->version
= src
->version
;
547 dst
->version_min
= src
->version_min
;
549 dst
->uuid
= src
->uuid
;
550 dst
->user_uuid
= src
->user_uuid
;
551 memcpy(dst
->label
, src
->label
, sizeof(dst
->label
));
553 dst
->block_size
= src
->block_size
;
554 dst
->nr_devices
= src
->nr_devices
;
556 dst
->time_base_lo
= src
->time_base_lo
;
557 dst
->time_base_hi
= src
->time_base_hi
;
558 dst
->time_precision
= src
->time_precision
;
559 dst
->write_time
= src
->write_time
;
561 memcpy(dst
->flags
, src
->flags
, sizeof(dst
->flags
));
562 memcpy(dst
->features
, src
->features
, sizeof(dst
->features
));
563 memcpy(dst
->compat
, src
->compat
, sizeof(dst
->compat
));
565 for (i
= 0; i
< BCH_SB_FIELD_NR
; i
++) {
/* Per-device fields are not propagated between superblocks. */
568 if ((1U << i
) & BCH_SINGLE_DEVICE_SB_FIELDS
)
571 src_f
= bch2_sb_field_get_id(src
, i
);
572 dst_f
= bch2_sb_field_get_id(dst
, i
);
/* d: size difference (in u64s) between the source and destination field. */
574 d
= (src_f
? le32_to_cpu(src_f
->u64s
) : 0) -
575 (dst_f
? le32_to_cpu(dst_f
->u64s
) : 0);
577 int ret
= bch2_sb_realloc(dst_handle
,
578 le32_to_cpu(dst_handle
->sb
->u64s
) + d
);
583 dst
= dst_handle
->sb
;
584 dst_f
= bch2_sb_field_get_id(dst
, i
);
587 dst_f
= __bch2_sb_field_resize(dst_handle
, dst_f
,
588 src_f
? le32_to_cpu(src_f
->u64s
) : 0);
591 memcpy(dst_f
, src_f
, vstruct_bytes(src_f
));
/*
 * Load superblock @src into filesystem @c. Requires c->sb_lock.
 *
 * Runs, stopping at the first step that returns non-zero (?: chain):
 *   bch2_sb_realloc() on c->disk_sb, __copy_super() from @src,
 *   bch2_sb_replicas_to_cpu_replicas(), bch2_sb_disk_groups_to_cpu().
 * What happens after the chain is on lines missing from this listing.
 */
597 int bch2_sb_to_fs(struct bch_fs
*c
, struct bch_sb
*src
)
601 lockdep_assert_held(&c
->sb_lock
);
603 ret
= bch2_sb_realloc(&c
->disk_sb
, 0) ?:
604 __copy_super(&c
->disk_sb
, src
) ?:
605 bch2_sb_replicas_to_cpu_replicas(c
) ?:
606 bch2_sb_disk_groups_to_cpu(c
);
/*
 * Copy the filesystem-wide superblock (c->disk_sb.sb) into device @ca's
 * own superblock buffer. Returns __copy_super()'s result.
 */
614 int bch2_sb_from_fs(struct bch_fs
*c
, struct bch_dev
*ca
)
616 return __copy_super(&ca
->disk_sb
, c
->disk_sb
.sb
);
619 /* read superblock: */
/*
 * Read and check the superblock stored at sector @offset of @sb's device,
 * writing a description of any failure to @err.
 *
 * Steps visible here:
 *  - synchronous read of sb->buffer_size bytes into sb->sb
 *  - magic must be BCACHE_MAGIC or BCHFS_MAGIC (-BCH_ERR_invalid_sb_magic)
 *  - version check via bch2_sb_compatible()
 *  - size must not exceed the layout maximum (-BCH_ERR_invalid_sb_too_big)
 *  - if the superblock is larger than the buffer, the buffer is grown
 *    (what follows the realloc is on lines missing from this listing)
 *  - checksum type must be known (-BCH_ERR_invalid_sb_csum_type) and the
 *    checksum must match (-BCH_ERR_invalid_sb_csum)
 *  - on success sb->seq caches the superblock's sequence number
 *
 * NOTE(review): the size comparison uses "512 << sb_max_size_bits" (int)
 * while the message uses "512UL << ..."; with a large, unvalidated
 * sb_max_size_bits the two can disagree or the shift can overflow -
 * confirm the field is bounded before this point.
 */
621 static int read_one_super(struct bch_sb_handle
*sb
, u64 offset
, struct printbuf
*err
)
626 bio_reset(sb
->bio
, sb
->bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
627 sb
->bio
->bi_iter
.bi_sector
= offset
;
628 bch2_bio_map(sb
->bio
, sb
->sb
, sb
->buffer_size
);
630 ret
= submit_bio_wait(sb
->bio
);
632 prt_printf(err
, "IO error: %i", ret
);
636 if (!uuid_equal(&sb
->sb
->magic
, &BCACHE_MAGIC
) &&
637 !uuid_equal(&sb
->sb
->magic
, &BCHFS_MAGIC
)) {
638 prt_str(err
, "Not a bcachefs superblock (got magic ");
639 pr_uuid(err
, sb
->sb
->magic
.b
);
641 return -BCH_ERR_invalid_sb_magic
;
644 ret
= bch2_sb_compatible(sb
->sb
, err
);
648 bytes
= vstruct_bytes(sb
->sb
);
650 if (bytes
> 512 << sb
->sb
->layout
.sb_max_size_bits
) {
651 prt_printf(err
, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
652 bytes
, 512UL << sb
->sb
->layout
.sb_max_size_bits
);
653 return -BCH_ERR_invalid_sb_too_big
;
656 if (bytes
> sb
->buffer_size
) {
657 ret
= bch2_sb_realloc(sb
, le32_to_cpu(sb
->sb
->u64s
));
663 enum bch_csum_type csum_type
= BCH_SB_CSUM_TYPE(sb
->sb
);
664 if (csum_type
>= BCH_CSUM_NR
) {
665 prt_printf(err
, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb
->sb
));
666 return -BCH_ERR_invalid_sb_csum_type
;
669 /* XXX: verify MACs */
670 struct bch_csum csum
= csum_vstruct(NULL
, csum_type
, null_nonce(), sb
->sb
);
671 if (bch2_crc_cmp(csum
, sb
->sb
->csum
)) {
672 bch2_csum_err_msg(err
, csum_type
, sb
->sb
->csum
, csum
);
673 return -BCH_ERR_invalid_sb_csum
;
676 sb
->seq
= le64_to_cpu(sb
->sb
->seq
);
/*
 * Open the block device at @path and read a valid superblock into @sb.
 *
 * @opts                 mount options; may be modified (nochanges is set
 *                       when the device could only be opened read-only,
 *                       direct_io is cleared in the block-size branch)
 * @ignore_notbchfs_msg  when true, a "not a bcachefs superblock" failure on
 *                       the default superblock is logged at KERN_INFO
 *                       instead of KERN_ERR
 *
 * Flow visible in this listing:
 *  - zero @sb, allocate a holder cookie and a copy of @path
 *  - build the open mode from the direct_io / noexcl / nochanges options
 *  - open the device; on -EACCES with read_only set, retry without write
 *  - allocate the superblock buffer, then read the superblock at the
 *    offset given by the "sb" option
 *  - if that fails (and, per the opt_defined() test, apparently only when
 *    the offset was not given explicitly), report the error, read the
 *    layout sector and try each offset listed there other than the one
 *    already tried
 *  - compare the filesystem block size with the device's logical block
 *    size: the visible code either clears direct_io or fails with
 *    -BCH_ERR_block_size_too_small (the selecting condition is not shown)
 *  - mark the layout as known and run bch2_sb_validate()
 *
 * Error-path cleanup and return statements are on lines missing from this
 * listing.
 */
681 static int __bch2_read_super(const char *path
, struct bch_opts
*opts
,
682 struct bch_sb_handle
*sb
, bool ignore_notbchfs_msg
)
684 u64 offset
= opt_get(*opts
, sb
);
685 struct bch_sb_layout layout
;
686 struct printbuf err
= PRINTBUF
;
687 struct printbuf err2
= PRINTBUF
;
693 memset(sb
, 0, sizeof(*sb
));
694 sb
->mode
= BLK_OPEN_READ
;
696 sb
->holder
= kmalloc(1, GFP_KERNEL
);
700 sb
->sb_name
= kstrdup(path
, GFP_KERNEL
);
705 if (opt_get(*opts
, direct_io
) == false)
706 sb
->mode
|= BLK_OPEN_BUFFERED
;
709 if (!opt_get(*opts
, noexcl
))
710 sb
->mode
|= BLK_OPEN_EXCL
;
712 if (!opt_get(*opts
, nochanges
))
713 sb
->mode
|= BLK_OPEN_WRITE
;
715 sb
->s_bdev_file
= bdev_file_open_by_path(path
, sb
->mode
, sb
->holder
, &bch2_sb_handle_bdev_ops
);
/* Fall back to a read-only open when write access is denied. */
716 if (IS_ERR(sb
->s_bdev_file
) &&
717 PTR_ERR(sb
->s_bdev_file
) == -EACCES
&&
718 opt_get(*opts
, read_only
)) {
719 sb
->mode
&= ~BLK_OPEN_WRITE
;
721 sb
->s_bdev_file
= bdev_file_open_by_path(path
, sb
->mode
, sb
->holder
, &bch2_sb_handle_bdev_ops
);
722 if (!IS_ERR(sb
->s_bdev_file
))
723 opt_set(*opts
, nochanges
, true);
726 if (IS_ERR(sb
->s_bdev_file
)) {
727 ret
= PTR_ERR(sb
->s_bdev_file
);
728 prt_printf(&err
, "error opening %s: %s", path
, bch2_err_str(ret
));
731 sb
->bdev
= file_bdev(sb
->s_bdev_file
);
733 ret
= bch2_sb_realloc(sb
, 0);
735 prt_printf(&err
, "error allocating memory for superblock");
739 if (bch2_fs_init_fault("read_super")) {
740 prt_printf(&err
, "dynamic fault");
745 ret
= read_one_super(sb
, offset
, &err
);
749 if (opt_defined(*opts
, sb
))
752 prt_printf(&err2
, "bcachefs (%s): error reading default superblock: %s\n",
754 if (ret
== -BCH_ERR_invalid_sb_magic
&& ignore_notbchfs_msg
)
755 bch2_print_opts(opts
, KERN_INFO
"%s", err2
.buf
);
757 bch2_print_opts(opts
, KERN_ERR
"%s", err2
.buf
);
759 printbuf_exit(&err2
);
760 printbuf_reset(&err
);
763 * Error reading primary superblock - read location of backup
766 bio_reset(sb
->bio
, sb
->bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
767 sb
->bio
->bi_iter
.bi_sector
= BCH_SB_LAYOUT_SECTOR
;
769 * use sb buffer to read layout, since sb buffer is page aligned but
772 bch2_bio_map(sb
->bio
, sb
->sb
, sizeof(struct bch_sb_layout
));
774 ret
= submit_bio_wait(sb
->bio
);
776 prt_printf(&err
, "IO error: %i", ret
);
780 memcpy(&layout
, sb
->sb
, sizeof(layout
));
781 ret
= validate_sb_layout(&layout
, &err
);
785 for (i
= layout
.sb_offset
;
786 i
< layout
.sb_offset
+ layout
.nr_superblocks
; i
++) {
787 offset
= le64_to_cpu(*i
);
789 if (offset
== opt_get(*opts
, sb
))
792 ret
= read_one_super(sb
, offset
, &err
);
800 if (le16_to_cpu(sb
->sb
->block_size
) << 9 <
801 bdev_logical_block_size(sb
->bdev
) &&
802 opt_get(*opts
, direct_io
)) {
804 opt_set(*opts
, direct_io
, false);
808 prt_printf(&err
, "block size (%u) smaller than device block size (%u)",
809 le16_to_cpu(sb
->sb
->block_size
) << 9,
810 bdev_logical_block_size(sb
->bdev
));
811 ret
= -BCH_ERR_block_size_too_small
;
815 sb
->have_layout
= true;
817 ret
= bch2_sb_validate(sb
, &err
, READ
);
819 bch2_print_opts(opts
, KERN_ERR
"bcachefs (%s): error validating superblock: %s\n",
827 bch2_print_opts(opts
, KERN_ERR
"bcachefs (%s): error reading superblock: %s\n",
/*
 * Open @path and read its superblock into @sb. Thin wrapper around
 * __bch2_read_super() with ignore_notbchfs_msg = false, so a "not a
 * bcachefs superblock" failure is logged at error level.
 */
834 int bch2_read_super(const char *path
, struct bch_opts
*opts
,
835 struct bch_sb_handle
*sb
)
837 return __bch2_read_super(path
, opts
, sb
, false);
840 /* provide a silenced version for mount.bcachefs */
/*
 * Same as bch2_read_super() but passes ignore_notbchfs_msg = true, which
 * downgrades the "not a bcachefs superblock" message for the default
 * superblock to KERN_INFO.
 */
842 int bch2_read_super_silent(const char *path
, struct bch_opts
*opts
,
843 struct bch_sb_handle
*sb
)
845 return __bch2_read_super(path
, opts
, sb
, true);
848 /* write superblock: */
/*
 * Completion handler for superblock bios (installed by both
 * read_back_super() and write_one_super()).
 *
 * The device comes from bio->bi_private. If bch2_dev_io_err_on() reports
 * an error for the bio's status, ca->sb_write_error is set. In all cases
 * the reference on the filesystem's sb_write closure and the device's
 * io_ref are dropped.
 */
850 static void write_super_endio(struct bio
*bio
)
852 struct bch_dev
*ca
= bio
->bi_private
;
854 /* XXX: return errors directly */
856 if (bch2_dev_io_err_on(bio
->bi_status
, ca
,
858 ? BCH_MEMBER_ERROR_write
859 : BCH_MEMBER_ERROR_read
,
860 "superblock %s error: %s",
861 bio_data_dir(bio
) ? "write" : "read",
862 bch2_blk_status_to_str(bio
->bi_status
)))
863 ca
->sb_write_error
= 1;
865 closure_put(&ca
->fs
->sb_write
);
866 percpu_ref_put(&ca
->io_ref
);
/*
 * Submit an asynchronous read of device @ca's first superblock
 * (layout.sb_offset[0]) into ca->sb_read_scratch, PAGE_SIZE bytes.
 *
 * bch2_write_super() uses the result to compare the on-disk sequence
 * number with the expected one before writing. Completion is handled by
 * write_super_endio(); an io_ref is taken here and the bio is submitted
 * against the c->sb_write closure.
 */
869 static void read_back_super(struct bch_fs
*c
, struct bch_dev
*ca
)
871 struct bch_sb
*sb
= ca
->disk_sb
.sb
;
872 struct bio
*bio
= ca
->disk_sb
.bio
;
874 bio_reset(bio
, ca
->disk_sb
.bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
875 bio
->bi_iter
.bi_sector
= le64_to_cpu(sb
->layout
.sb_offset
[0]);
876 bio
->bi_end_io
= write_super_endio
;
877 bio
->bi_private
= ca
;
878 bch2_bio_map(bio
, ca
->sb_read_scratch
, PAGE_SIZE
);
880 this_cpu_add(ca
->io_done
->sectors
[READ
][BCH_DATA_sb
],
883 percpu_ref_get(&ca
->io_ref
);
884 closure_bio_submit(bio
, &c
->sb_write
);
/*
 * Submit an asynchronous write of device @ca's superblock to the @idx-th
 * location listed in its layout.
 *
 * Before submission the superblock's offset member is set to that
 * location, and its checksum type and checksum are refreshed from the
 * metadata_checksum option. The write length is the superblock size
 * rounded up to the device's logical block size. Completion is handled by
 * write_super_endio().
 */
887 static void write_one_super(struct bch_fs
*c
, struct bch_dev
*ca
, unsigned idx
)
889 struct bch_sb
*sb
= ca
->disk_sb
.sb
;
890 struct bio
*bio
= ca
->disk_sb
.bio
;
892 sb
->offset
= sb
->layout
.sb_offset
[idx
];
894 SET_BCH_SB_CSUM_TYPE(sb
, bch2_csum_opt_to_type(c
->opts
.metadata_checksum
, false));
895 sb
->csum
= csum_vstruct(c
, BCH_SB_CSUM_TYPE(sb
),
898 bio_reset(bio
, ca
->disk_sb
.bdev
, REQ_OP_WRITE
|REQ_SYNC
|REQ_META
);
899 bio
->bi_iter
.bi_sector
= le64_to_cpu(sb
->offset
);
900 bio
->bi_end_io
= write_super_endio
;
901 bio
->bi_private
= ca
;
902 bch2_bio_map(bio
, sb
,
903 roundup((size_t) vstruct_bytes(sb
),
904 bdev_logical_block_size(ca
->disk_sb
.bdev
)));
906 this_cpu_add(ca
->io_done
->sectors
[WRITE
][BCH_DATA_sb
],
909 percpu_ref_get(&ca
->io_ref
);
910 closure_bio_submit(bio
, &c
->sb_write
);
/*
 * Write the filesystem superblock to every online member device.
 * Requires c->sb_lock.
 *
 * Phases visible in this listing:
 *  1. Stamp the in-memory superblock: current magic, seq incremented, each
 *     online member's seq in members_v2 set to it, write_time, error
 *     flags, endianness flag; then refresh counters, members, errors and
 *     downgrade sections from their CPU-side state.
 *  2. Copy the superblock to each online device (bch2_sb_from_fs()) and
 *     validate each copy; an invalid copy marks the fs inconsistent.
 *  3. Tests on opts.nochanges and on BCH_SB_INITIALIZED follow; their
 *     bodies are not shown (the in-code comment says the write is
 *     deferred until initialization is complete).
 *  4. Refuse to write a version newer than this code supports
 *     (-BCH_ERR_sb_not_downgraded).
 *  5. Read back the first superblock of each device and compare its seq
 *     with the seq last recorded for the device: lower means a previous
 *     write was dropped, higher means someone else modified it; both are
 *     fatal (-BCH_ERR_erofs_sb_err).
 *  6. Write superblock slot sb on every device that has that slot and no
 *     error so far (the loop advancing sb is on lines missing from this
 *     listing).
 *  7. Clear failed devices from sb_written, record the new seq per device,
 *     and decide via bch2_have_enough_devs() whether enough devices were
 *     written for the filesystem to be mountable; if not, fatal error.
 *
 * NOTE(review): in the final condition the third term
 * (can_mount_without_written && !can_mount_with_written) is implied by the
 * second term (!can_mount_with_written), so it never changes the result;
 * the comment above it describes an exception that the expression as
 * written does not implement - confirm intent.
 */
913 int bch2_write_super(struct bch_fs
*c
)
915 struct closure
*cl
= &c
->sb_write
;
916 struct printbuf err
= PRINTBUF
;
917 unsigned sb
= 0, nr_wrote
;
918 struct bch_devs_mask sb_written
;
919 bool wrote
, can_mount_without_written
, can_mount_with_written
;
920 unsigned degraded_flags
= BCH_FORCE_IF_DEGRADED
;
923 trace_and_count(c
, write_super
, c
, _RET_IP_
);
925 if (c
->opts
.very_degraded
)
926 degraded_flags
|= BCH_FORCE_IF_LOST
;
928 lockdep_assert_held(&c
->sb_lock
);
930 closure_init_stack(cl
);
931 memset(&sb_written
, 0, sizeof(sb_written
));
933 /* Make sure we're using the new magic numbers: */
934 c
->disk_sb
.sb
->magic
= BCHFS_MAGIC
;
935 c
->disk_sb
.sb
->layout
.magic
= BCHFS_MAGIC
;
937 le64_add_cpu(&c
->disk_sb
.sb
->seq
, 1);
939 struct bch_sb_field_members_v2
*mi
= bch2_sb_field_get(c
->disk_sb
.sb
, members_v2
);
940 for_each_online_member(c
, ca
)
941 __bch2_members_v2_get_mut(mi
, ca
->dev_idx
)->seq
= c
->disk_sb
.sb
->seq
;
942 c
->disk_sb
.sb
->write_time
= cpu_to_le64(ktime_get_real_seconds());
944 if (test_bit(BCH_FS_error
, &c
->flags
))
945 SET_BCH_SB_HAS_ERRORS(c
->disk_sb
.sb
, 1);
946 if (test_bit(BCH_FS_topology_error
, &c
->flags
))
947 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c
->disk_sb
.sb
, 1);
949 SET_BCH_SB_BIG_ENDIAN(c
->disk_sb
.sb
, CPU_BIG_ENDIAN
);
951 bch2_sb_counters_from_cpu(c
);
952 bch2_sb_members_from_cpu(c
);
953 bch2_sb_members_cpy_v2_v1(&c
->disk_sb
);
954 bch2_sb_errors_from_cpu(c
);
955 bch2_sb_downgrade_update(c
);
957 for_each_online_member(c
, ca
)
958 bch2_sb_from_fs(c
, ca
);
960 for_each_online_member(c
, ca
) {
961 printbuf_reset(&err
);
963 ret
= bch2_sb_validate(&ca
->disk_sb
, &err
, WRITE
);
965 bch2_fs_inconsistent(c
, "sb invalid before write: %s", err
.buf
);
966 percpu_ref_put(&ca
->io_ref
);
971 if (c
->opts
.nochanges
)
975 * Defer writing the superblock until filesystem initialization is
976 * complete - don't write out a partly initialized superblock:
978 if (!BCH_SB_INITIALIZED(c
->disk_sb
.sb
))
981 if (le16_to_cpu(c
->disk_sb
.sb
->version
) > bcachefs_metadata_version_current
) {
982 struct printbuf buf
= PRINTBUF
;
983 prt_printf(&buf
, "attempting to write superblock that wasn't version downgraded (");
984 bch2_version_to_text(&buf
, le16_to_cpu(c
->disk_sb
.sb
->version
));
985 prt_str(&buf
, " > ");
986 bch2_version_to_text(&buf
, bcachefs_metadata_version_current
);
988 bch2_fs_fatal_error(c
, ": %s", buf
.buf
);
990 return -BCH_ERR_sb_not_downgraded
;
/* Assume every online device will be written; failures are cleared later. */
993 for_each_online_member(c
, ca
) {
994 __set_bit(ca
->dev_idx
, sb_written
.d
);
995 ca
->sb_write_error
= 0;
998 for_each_online_member(c
, ca
)
999 read_back_super(c
, ca
);
1002 for_each_online_member(c
, ca
) {
1003 if (ca
->sb_write_error
)
1006 if (le64_to_cpu(ca
->sb_read_scratch
->seq
) < ca
->disk_sb
.seq
) {
1007 bch2_fs_fatal_error(c
,
1008 ": Superblock write was silently dropped! (seq %llu expected %llu)",
1009 le64_to_cpu(ca
->sb_read_scratch
->seq
),
1011 percpu_ref_put(&ca
->io_ref
);
1012 ret
= -BCH_ERR_erofs_sb_err
;
1016 if (le64_to_cpu(ca
->sb_read_scratch
->seq
) > ca
->disk_sb
.seq
) {
1017 bch2_fs_fatal_error(c
,
1018 ": Superblock modified by another process (seq %llu expected %llu)",
1019 le64_to_cpu(ca
->sb_read_scratch
->seq
),
1021 percpu_ref_put(&ca
->io_ref
);
1022 ret
= -BCH_ERR_erofs_sb_err
;
1029 for_each_online_member(c
, ca
)
1030 if (!ca
->sb_write_error
&&
1031 sb
< ca
->disk_sb
.sb
->layout
.nr_superblocks
) {
1032 write_one_super(c
, ca
, sb
);
1039 for_each_online_member(c
, ca
) {
1040 if (ca
->sb_write_error
)
1041 __clear_bit(ca
->dev_idx
, sb_written
.d
);
1043 ca
->disk_sb
.seq
= le64_to_cpu(ca
->disk_sb
.sb
->seq
);
1046 nr_wrote
= dev_mask_nr(&sb_written
);
1048 can_mount_with_written
=
1049 bch2_have_enough_devs(c
, sb_written
, degraded_flags
, false);
/* Invert the mask: now it names the devices that were NOT written. */
1051 for (unsigned i
= 0; i
< ARRAY_SIZE(sb_written
.d
); i
++)
1052 sb_written
.d
[i
] = ~sb_written
.d
[i
];
1054 can_mount_without_written
=
1055 bch2_have_enough_devs(c
, sb_written
, degraded_flags
, false);
1058 * If we would be able to mount _without_ the devices we successfully
1059 * wrote superblocks to, we weren't able to write to enough devices:
1061 * Exception: if we can mount without the successes because we haven't
1062 * written anything (new filesystem), we continue if we'd be able to
1063 * mount with the devices we did successfully write to:
1065 if (bch2_fs_fatal_err_on(!nr_wrote
||
1066 !can_mount_with_written
||
1067 (can_mount_without_written
&&
1068 !can_mount_with_written
), c
,
1069 ": Unable to write superblock to sufficient devices (from %ps)",
1073 /* Make new options visible after they're persistent: */
1075 printbuf_exit(&err
);
/*
 * Make sure feature bit @feat is recorded in the superblock.
 *
 * Takes c->sb_lock. If the cached feature mask (c->sb.features) does not
 * have the bit, it is set in the on-disk superblock's features[0] and the
 * superblock is written out. The return value of bch2_write_super() is
 * not used here.
 */
1079 void __bch2_check_set_feature(struct bch_fs
*c
, unsigned feat
)
1081 mutex_lock(&c
->sb_lock
);
1082 if (!(c
->sb
.features
& (1ULL << feat
))) {
1083 c
->disk_sb
.sb
->features
[0] |= cpu_to_le64(1ULL << feat
);
1085 bch2_write_super(c
);
1087 mutex_unlock(&c
->sb_lock
);
1090 /* Downgrade if superblock is at a higher version than currently supported: */
/*
 * Clamp the superblock's version fields to the newest version this code
 * supports. Requires c->sb_lock.
 *
 * ret is computed up front as "the superblock version is newer than
 * bcachefs_metadata_version_current" (the return statement itself is not
 * visible in this listing). Then version_upgrade_complete, version and
 * version_min are each lowered to the current version if they exceed it,
 * in both the on-disk buffer and the cached c->sb copy, and compat bits at
 * or above BCH_COMPAT_NR are cleared.
 */
1091 bool bch2_check_version_downgrade(struct bch_fs
*c
)
1093 bool ret
= bcachefs_metadata_version_current
< c
->sb
.version
;
1095 lockdep_assert_held(&c
->sb_lock
);
1098 * Downgrade, if superblock is at a higher version than currently
1101 * c->sb will be checked before we write the superblock, so update it as
1104 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
) > bcachefs_metadata_version_current
) {
1105 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
, bcachefs_metadata_version_current
);
1106 c
->sb
.version_upgrade_complete
= bcachefs_metadata_version_current
;
1108 if (c
->sb
.version
> bcachefs_metadata_version_current
) {
1109 c
->disk_sb
.sb
->version
= cpu_to_le16(bcachefs_metadata_version_current
);
1110 c
->sb
.version
= bcachefs_metadata_version_current
;
1112 if (c
->sb
.version_min
> bcachefs_metadata_version_current
) {
1113 c
->disk_sb
.sb
->version_min
= cpu_to_le16(bcachefs_metadata_version_current
);
1114 c
->sb
.version_min
= bcachefs_metadata_version_current
;
1116 c
->disk_sb
.sb
->compat
[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR
) - 1);
/*
 * Raise the superblock version to @new_version. Requires c->sb_lock.
 *
 * When the major version number increases, the "downgrade" field is
 * resized to zero first. The version is then stored and all bits in
 * BCH_SB_FEATURES_ALL are set in features[0].
 */
1120 void bch2_sb_upgrade(struct bch_fs
*c
, unsigned new_version
)
1122 lockdep_assert_held(&c
->sb_lock
);
1124 if (BCH_VERSION_MAJOR(new_version
) >
1125 BCH_VERSION_MAJOR(le16_to_cpu(c
->disk_sb
.sb
->version
)))
1126 bch2_sb_field_resize(&c
->disk_sb
, downgrade
, 0);
1128 c
->disk_sb
.sb
->version
= cpu_to_le16(new_version
);
1129 c
->disk_sb
.sb
->features
[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL
);
/*
 * Validate the "ext" superblock field: it must be at least 88 bytes long,
 * otherwise a message is written to @err and -BCH_ERR_invalid_sb_ext is
 * returned.
 */
1132 static int bch2_sb_ext_validate(struct bch_sb
*sb
, struct bch_sb_field
*f
,
1133 struct printbuf
*err
)
1135 if (vstruct_bytes(f
) < 88) {
1136 prt_printf(err
, "field too small (%zu < %u)", vstruct_bytes(f
), 88);
1137 return -BCH_ERR_invalid_sb_ext
;
/*
 * Print the "ext" superblock field to @out: the required recovery passes
 * (converted from their stable on-disk numbering) and the set of errors to
 * fix silently.
 *
 * The errors_silent bit vector is converted into a temporary kmalloc()ed
 * buffer before printing; if that allocation fails the "Errors to silently
 * fix" section is skipped. The buffer is freed before returning.
 */
1143 static void bch2_sb_ext_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1144 struct bch_sb_field
*f
)
1146 struct bch_sb_field_ext
*e
= field_to_type(f
, ext
);
1148 prt_printf(out
, "Recovery passes required:");
1150 prt_bitflags(out
, bch2_recovery_passes
,
1151 bch2_recovery_passes_from_stable(le64_to_cpu(e
->recovery_passes_required
[0])));
1154 unsigned long *errors_silent
= kmalloc(sizeof(e
->errors_silent
), GFP_KERNEL
);
1155 if (errors_silent
) {
1156 le_bitvector_to_cpu(errors_silent
, (void *) e
->errors_silent
, sizeof(e
->errors_silent
) * 8);
1158 prt_printf(out
, "Errors to silently fix:");
1160 prt_bitflags_vector(out
, bch2_sb_error_strs
, errors_silent
, sizeof(e
->errors_silent
) * 8);
1163 kfree(errors_silent
);
/* Validate / to_text callbacks for the "ext" superblock field. */
1167 static const struct bch_sb_field_ops bch_sb_field_ops_ext
= {
1168 .validate
= bch2_sb_ext_validate
,
1169 .to_text
= bch2_sb_ext_to_text
,
/*
 * Per-field-type ops table, indexed by BCH_SB_FIELD_<name>; each slot
 * points at the matching bch_sb_field_ops_<name>. The x-macro that drives
 * the initializer is on lines missing from this listing.
 */
1172 static const struct bch_sb_field_ops
*bch2_sb_field_ops
[] = {
1174 [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
/*
 * Ops with no callbacks set (no initializer), returned by
 * bch2_sb_field_type_ops() for field types outside the ops table.
 */
1179 static const struct bch_sb_field_ops bch2_sb_field_null_ops
;
/*
 * Return the ops for superblock field @type: the bch2_sb_field_ops entry
 * when @type is within the table, otherwise bch2_sb_field_null_ops so that
 * callers always get a dereferenceable pointer for out-of-range types.
 */
1181 static const struct bch_sb_field_ops
*bch2_sb_field_type_ops(unsigned type
)
1183 return likely(type
< ARRAY_SIZE(bch2_sb_field_ops
))
1184 ? bch2_sb_field_ops
[type
]
1185 : &bch2_sb_field_null_ops
;
/*
 * Run the type-specific validator for field @f of @sb.
 *
 * The validator (if the field type has one; otherwise the result is 0)
 * writes its reason into a private printbuf. The visible error reporting
 * then prints "Invalid superblock section <name>: <reason>" to @err,
 * followed by a text dump of the field; the condition guarding that
 * reporting is on a line missing from this listing. The private printbuf
 * is released before returning.
 */
1188 static int bch2_sb_field_validate(struct bch_sb
*sb
, struct bch_sb_field
*f
,
1189 struct printbuf
*err
)
1191 unsigned type
= le32_to_cpu(f
->type
);
1192 struct printbuf field_err
= PRINTBUF
;
1193 const struct bch_sb_field_ops
*ops
= bch2_sb_field_type_ops(type
);
1196 ret
= ops
->validate
? ops
->validate(sb
, f
, &field_err
) : 0;
1198 prt_printf(err
, "Invalid superblock section %s: %s",
1199 bch2_sb_fields
[type
], field_err
.buf
);
1201 bch2_sb_field_to_text(err
, sb
, f
);
1204 printbuf_exit(&field_err
);
/*
 * Print the body of field @f using its type's to_text callback.
 *
 * A tabstop at column 32 is pushed when @out has none yet. Any guard
 * around the to_text call (for types without a callback) is on a line
 * missing from this listing.
 */
1208 void __bch2_sb_field_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1209 struct bch_sb_field
*f
)
1211 unsigned type
= le32_to_cpu(f
->type
);
1212 const struct bch_sb_field_ops
*ops
= bch2_sb_field_type_ops(type
);
1214 if (!out
->nr_tabstops
)
1215 printbuf_tabstop_push(out
, 32);
1218 ops
->to_text(out
, sb
, f
);
/*
 * Print field @f with a heading: the field's name when its type is below
 * BCH_SB_FIELD_NR, "(unknown field N)" otherwise, followed by its size in
 * bytes and then the type-specific body from __bch2_sb_field_to_text().
 */
1221 void bch2_sb_field_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1222 struct bch_sb_field
*f
)
1224 unsigned type
= le32_to_cpu(f
->type
);
1226 if (type
< BCH_SB_FIELD_NR
)
1227 prt_printf(out
, "%s", bch2_sb_fields
[type
]);
1229 prt_printf(out
, "(unknown field %u)", type
);
1231 prt_printf(out
, " (size %zu):", vstruct_bytes(f
));
1234 __bch2_sb_field_to_text(out
, sb
, f
);
/*
 * Print superblock layout @l to @out: layout type, maximum superblock
 * size, number of superblocks and each superblock's offset.
 *
 * NOTE(review): the maximum size is computed as "512 << sb_max_size_bits"
 * in int, whereas bch2_sb_to_text() prints the same quantity with
 * "512ULL <<"; for large sb_max_size_bits the two would differ - confirm
 * whether this should be 512ULL as well.
 */
1237 void bch2_sb_layout_to_text(struct printbuf
*out
, struct bch_sb_layout
*l
)
1241 prt_printf(out
, "Type: %u", l
->layout_type
);
1244 prt_str(out
, "Superblock max size: ");
1245 prt_units_u64(out
, 512 << l
->sb_max_size_bits
);
1248 prt_printf(out
, "Nr superblocks: %u", l
->nr_superblocks
);
1251 prt_str(out
, "Offsets: ");
1252 for (i
= 0; i
< l
->nr_superblocks
; i
++) {
1255 prt_printf(out
, "%llu", le64_to_cpu(l
->sb_offset
[i
]));
/*
 * Print a human-readable description of superblock @sb to @out.
 *
 * @print_layout  not referenced on any line visible in this listing;
 *                presumably gates the "layout:" section - confirm
 * @fields        bitmask of field types (1 << type) whose contents should
 *                be dumped at the end
 *
 * Output order visible here: UUIDs, magic, device index, label, versions,
 * creation time (or "(not set)"), sequence number, last write time,
 * superblock size and maximum, clean flag, number of existing devices,
 * the list of sections present, features, compat features, every option
 * stored in the superblock, the layout, and finally the selected fields.
 *
 * A tabstop at column 44 is pushed when @out has none yet.
 *
 * NOTE(review): both "1 << le32_to_cpu(f->type)" expressions shift an int
 * by an on-disk value, and the first is accumulated into a u64 mask; a
 * field type of 31 or more would overflow the shift - confirm types are
 * bounded before this is reached.
 */
1260 void bch2_sb_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1261 bool print_layout
, unsigned fields
)
1263 u64 fields_have
= 0;
1264 unsigned nr_devices
= 0;
1266 if (!out
->nr_tabstops
)
1267 printbuf_tabstop_push(out
, 44);
/* Count only the member slots that actually hold a device. */
1269 for (int i
= 0; i
< sb
->nr_devices
; i
++)
1270 nr_devices
+= bch2_dev_exists(sb
, i
);
1272 prt_printf(out
, "External UUID:");
1274 pr_uuid(out
, sb
->user_uuid
.b
);
1277 prt_printf(out
, "Internal UUID:");
1279 pr_uuid(out
, sb
->uuid
.b
);
1282 prt_printf(out
, "Magic number:");
1284 pr_uuid(out
, sb
->magic
.b
);
1287 prt_str(out
, "Device index:");
1289 prt_printf(out
, "%u", sb
->dev_idx
);
1292 prt_str(out
, "Label:");
1294 prt_printf(out
, "%.*s", (int) sizeof(sb
->label
), sb
->label
);
1297 prt_str(out
, "Version:");
1299 bch2_version_to_text(out
, le16_to_cpu(sb
->version
));
1302 prt_str(out
, "Version upgrade complete:");
1304 bch2_version_to_text(out
, BCH_SB_VERSION_UPGRADE_COMPLETE(sb
));
1307 prt_printf(out
, "Oldest version on disk:");
1309 bch2_version_to_text(out
, le16_to_cpu(sb
->version_min
));
1312 prt_printf(out
, "Created:");
1314 if (sb
->time_base_lo
)
1315 bch2_prt_datetime(out
, div_u64(le64_to_cpu(sb
->time_base_lo
), NSEC_PER_SEC
));
1317 prt_printf(out
, "(not set)");
1320 prt_printf(out
, "Sequence number:");
1322 prt_printf(out
, "%llu", le64_to_cpu(sb
->seq
));
1325 prt_printf(out
, "Time of last write:");
1327 bch2_prt_datetime(out
, le64_to_cpu(sb
->write_time
));
1330 prt_printf(out
, "Superblock size:");
1332 prt_units_u64(out
, vstruct_bytes(sb
));
1334 prt_units_u64(out
, 512ULL << sb
->layout
.sb_max_size_bits
);
1337 prt_printf(out
, "Clean:");
1339 prt_printf(out
, "%llu", BCH_SB_CLEAN(sb
));
1342 prt_printf(out
, "Devices:");
1344 prt_printf(out
, "%u", nr_devices
);
1347 prt_printf(out
, "Sections:");
1348 vstruct_for_each(sb
, f
)
1349 fields_have
|= 1 << le32_to_cpu(f
->type
);
1351 prt_bitflags(out
, bch2_sb_fields
, fields_have
);
1354 prt_printf(out
, "Features:");
1356 prt_bitflags(out
, bch2_sb_features
, le64_to_cpu(sb
->features
[0]));
1359 prt_printf(out
, "Compat features:");
1361 prt_bitflags(out
, bch2_sb_compat
, le64_to_cpu(sb
->compat
[0]));
1365 prt_printf(out
, "Options:");
1367 printbuf_indent_add(out
, 2);
1371 for (id
= 0; id
< bch2_opts_nr
; id
++) {
1372 const struct bch_option
*opt
= bch2_opt_table
+ id
;
1374 if (opt
->get_sb
!= BCH2_NO_SB_OPT
) {
1375 u64 v
= bch2_opt_from_sb(sb
, id
);
1377 prt_printf(out
, "%s:", opt
->attr
.name
);
1379 bch2_opt_to_text(out
, NULL
, sb
, opt
, v
,
1380 OPT_HUMAN_READABLE
|OPT_SHOW_FULL_LIST
);
1386 printbuf_indent_sub(out
, 2);
1390 prt_printf(out
, "layout:");
1392 printbuf_indent_add(out
, 2);
1393 bch2_sb_layout_to_text(out
, &sb
->layout
);
1394 printbuf_indent_sub(out
, 2);
1397 vstruct_for_each(sb
, f
)
1398 if (fields
& (1 << le32_to_cpu(f
->type
))) {
1400 bch2_sb_field_to_text(out
, sb
, f
);