1 // SPDX-License-Identifier: GPL-2.0
5 #include "disk_groups.h"
9 #include "journal_sb.h"
10 #include "journal_seq_blacklist.h"
15 #include "sb-counters.h"
16 #include "sb-downgrade.h"
17 #include "sb-errors.h"
18 #include "sb-members.h"
24 #include <linux/backing-dev.h>
25 #include <linux/sort.h>
27 static const struct blk_holder_ops bch2_sb_handle_bdev_ops
= {
30 struct bch2_metadata_version
{
35 static const struct bch2_metadata_version bch2_metadata_versions
[] = {
40 BCH_METADATA_VERSIONS()
/*
 * bch2_version_to_text - print a metadata version number to @out.
 *
 * Scans bch2_metadata_versions[] for an entry whose .version equals @v and
 * uses that entry's .name as the label; the label stays
 * "(unknown version)" when no entry matches.
 *
 * Output format is "<major>.<minor>: <label>", where major/minor come from
 * BCH_VERSION_MAJOR(v) and BCH_VERSION_MINOR(v).
 */
44 void bch2_version_to_text(struct printbuf
*out
, unsigned v
)
46 const char *str
= "(unknown version)";
48 for (unsigned i
= 0; i
< ARRAY_SIZE(bch2_metadata_versions
); i
++)
49 if (bch2_metadata_versions
[i
].version
== v
) {
50 str
= bch2_metadata_versions
[i
].name
;
54 prt_printf(out
, "%u.%u: %s", BCH_VERSION_MAJOR(v
), BCH_VERSION_MINOR(v
), str
);
/*
 * bch2_latest_compatible_version - walk bch2_metadata_versions[] and replace
 * @v with any listed version that is greater than @v and also passes a
 * major-version equality test.
 *
 * NOTE(review): this listing does not show the statement guarded by
 * `!BCH_VERSION_MAJOR(v)`, the right-hand side of the major-version `==`,
 * or the final return. Presumably the comparison is against
 * BCH_VERSION_MAJOR(v) and the updated v is returned - confirm against the
 * complete source before relying on this description.
 */
57 unsigned bch2_latest_compatible_version(unsigned v
)
59 if (!BCH_VERSION_MAJOR(v
))
62 for (unsigned i
= 0; i
< ARRAY_SIZE(bch2_metadata_versions
); i
++)
63 if (bch2_metadata_versions
[i
].version
> v
&&
64 BCH_VERSION_MAJOR(bch2_metadata_versions
[i
].version
) ==
66 v
= bch2_metadata_versions
[i
].version
;
71 const char * const bch2_sb_fields
[] = {
72 #define x(name, nr) #name,
78 static int bch2_sb_field_validate(struct bch_sb
*, struct bch_sb_field
*,
/*
 * bch2_sb_field_get_id - look up an optional superblock field by type.
 *
 * Iterates over the fields of @sb with vstruct_for_each() and compares each
 * field's little-endian ->type (converted with le32_to_cpu()) against @type.
 *
 * NOTE(review): the statement executed on a match and the fall-through
 * return are not visible in this listing; presumably the matching field is
 * returned, or NULL when none matches - confirm.
 */
81 struct bch_sb_field
*bch2_sb_field_get_id(struct bch_sb
*sb
,
82 enum bch_sb_field_type type
)
84 /* XXX: need locking around superblock to access optional fields */
86 vstruct_for_each(sb
, f
)
87 if (le32_to_cpu(f
->type
) == type
)
92 static struct bch_sb_field
*__bch2_sb_field_resize(struct bch_sb_handle
*sb
,
93 struct bch_sb_field
*f
,
96 unsigned old_u64s
= f
? le32_to_cpu(f
->u64s
) : 0;
97 unsigned sb_u64s
= le32_to_cpu(sb
->sb
->u64s
) + u64s
- old_u64s
;
99 BUG_ON(__vstruct_bytes(struct bch_sb
, sb_u64s
) > sb
->buffer_size
);
104 f
= vstruct_last(sb
->sb
);
105 memset(f
, 0, sizeof(u64
) * u64s
);
106 f
->u64s
= cpu_to_le32(u64s
);
111 src
= vstruct_end(f
);
114 f
->u64s
= cpu_to_le32(u64s
);
115 dst
= vstruct_end(f
);
120 memmove(dst
, src
, vstruct_end(sb
->sb
) - src
);
123 memset(src
, 0, dst
- src
);
126 sb
->sb
->u64s
= cpu_to_le32(sb_u64s
);
128 return u64s
? f
: NULL
;
/*
 * bch2_sb_field_delete - remove the optional field of @type from @sb.
 *
 * Looks the field up in sb->sb with bch2_sb_field_get_id() and then shrinks
 * it to zero u64s via __bch2_sb_field_resize().
 *
 * NOTE(review): original lines between the lookup and the resize call are
 * not present in this listing, so any guard on a missing field cannot be
 * seen here - confirm.
 */
131 void bch2_sb_field_delete(struct bch_sb_handle
*sb
,
132 enum bch_sb_field_type type
)
134 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
137 __bch2_sb_field_resize(sb
, f
, 0);
140 /* Superblock realloc/free: */
142 void bch2_free_super(struct bch_sb_handle
*sb
)
145 if (!IS_ERR_OR_NULL(sb
->s_bdev_file
))
146 fput(sb
->s_bdev_file
);
151 memset(sb
, 0, sizeof(*sb
));
154 int bch2_sb_realloc(struct bch_sb_handle
*sb
, unsigned u64s
)
156 size_t new_bytes
= __vstruct_bytes(struct bch_sb
, u64s
);
157 size_t new_buffer_size
;
158 struct bch_sb
*new_sb
;
162 new_bytes
= max_t(size_t, new_bytes
, bdev_logical_block_size(sb
->bdev
));
164 new_buffer_size
= roundup_pow_of_two(new_bytes
);
166 if (sb
->sb
&& sb
->buffer_size
>= new_buffer_size
)
169 if (sb
->sb
&& sb
->have_layout
) {
170 u64 max_bytes
= 512 << sb
->sb
->layout
.sb_max_size_bits
;
172 if (new_bytes
> max_bytes
) {
173 struct printbuf buf
= PRINTBUF
;
175 prt_bdevname(&buf
, sb
->bdev
);
176 prt_printf(&buf
, ": superblock too big: want %zu but have %llu", new_bytes
, max_bytes
);
177 pr_err("%s", buf
.buf
);
179 return -BCH_ERR_ENOSPC_sb
;
183 if (sb
->buffer_size
>= new_buffer_size
&& sb
->sb
)
186 if (dynamic_fault("bcachefs:add:super_realloc"))
187 return -BCH_ERR_ENOMEM_sb_realloc_injected
;
189 new_sb
= krealloc(sb
->sb
, new_buffer_size
, GFP_NOFS
|__GFP_ZERO
);
191 return -BCH_ERR_ENOMEM_sb_buf_realloc
;
196 unsigned nr_bvecs
= buf_pages(sb
->sb
, new_buffer_size
);
198 bio
= bio_kmalloc(nr_bvecs
, GFP_KERNEL
);
200 return -BCH_ERR_ENOMEM_sb_bio_realloc
;
202 bio_init(bio
, NULL
, bio
->bi_inline_vecs
, nr_bvecs
, 0);
208 sb
->buffer_size
= new_buffer_size
;
/*
 * bch2_sb_field_resize_id - resize (or create) the optional field of @type
 * so that it is u64s long.
 *
 * Steps visible in this listing:
 *  - look up the current field and compute the size delta
 *    d = u64s - old_u64s (old_u64s is 0 when the field does not exist);
 *  - grow the handle's buffer with bch2_sb_realloc() to sb->u64s + d;
 *  - derive the owning struct bch_fs from @sb via container_of(.., disk_sb),
 *    assert c->sb_lock is held, and realloc every online member's
 *    superblock buffer by the same delta;
 *  - look the field up again, resize it with __bch2_sb_field_resize(), and
 *    store @type (little endian) in f->type.
 *
 * NOTE(review): the failure returns, the condition guarding the
 * container_of() section and the check before writing f->type are on lines
 * missing from this listing - confirm against the complete source.
 * The second lookup is presumably needed because bch2_sb_realloc() may move
 * the buffer - confirm.
 */
213 struct bch_sb_field
*bch2_sb_field_resize_id(struct bch_sb_handle
*sb
,
214 enum bch_sb_field_type type
,
217 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
218 ssize_t old_u64s
= f
? le32_to_cpu(f
->u64s
) : 0;
219 ssize_t d
= -old_u64s
+ u64s
;
221 if (bch2_sb_realloc(sb
, le32_to_cpu(sb
->sb
->u64s
) + d
))
225 struct bch_fs
*c
= container_of(sb
, struct bch_fs
, disk_sb
);
227 lockdep_assert_held(&c
->sb_lock
);
229 /* XXX: we're not checking that offline device have enough space */
231 for_each_online_member(c
, ca
) {
232 struct bch_sb_handle
*dev_sb
= &ca
->disk_sb
;
234 if (bch2_sb_realloc(dev_sb
, le32_to_cpu(dev_sb
->sb
->u64s
) + d
)) {
/*
 * NOTE(review): this early exit drops ca->ref, while the early exits from
 * for_each_online_member() loops in bch2_write_super() drop ca->io_ref.
 * Confirm which reference the iterator actually holds; a mismatch would
 * unbalance both refcounts.
 */
235 percpu_ref_put(&ca
->ref
);
241 f
= bch2_sb_field_get_id(sb
->sb
, type
);
242 f
= __bch2_sb_field_resize(sb
, f
, u64s
);
244 f
->type
= cpu_to_le32(type
);
248 struct bch_sb_field
*bch2_sb_field_get_minsize_id(struct bch_sb_handle
*sb
,
249 enum bch_sb_field_type type
,
252 struct bch_sb_field
*f
= bch2_sb_field_get_id(sb
->sb
, type
);
254 if (!f
|| le32_to_cpu(f
->u64s
) < u64s
)
255 f
= bch2_sb_field_resize_id(sb
, type
, u64s
);
259 /* Superblock validate: */
/*
 * validate_sb_layout - sanity-check a superblock layout structure.
 *
 * Each failed check prints a message to @out and returns a negative
 * BCH_ERR_invalid_sb_layout* code. Checks, in order:
 *  - layout->magic must equal BCACHE_MAGIC or BCHFS_MAGIC;
 *  - layout->layout_type must be 0;
 *  - layout->nr_superblocks must be non-zero and no larger than
 *    ARRAY_SIZE(layout->sb_offset);
 *  - for every i >= 1, sb_offset[i] must be at least
 *    sb_offset[i - 1] + max_sectors, i.e. consecutive superblocks must not
 *    overlap.
 *
 * The BUILD_BUG_ON pins sizeof(struct bch_sb_layout) to 512 bytes.
 *
 * NOTE(review): the declaration of `i` and the success return are on lines
 * missing from this listing.
 */
261 static int validate_sb_layout(struct bch_sb_layout
*layout
, struct printbuf
*out
)
263 u64 offset
, prev_offset
, max_sectors
;
266 BUILD_BUG_ON(sizeof(struct bch_sb_layout
) != 512);
268 if (!uuid_equal(&layout
->magic
, &BCACHE_MAGIC
) &&
269 !uuid_equal(&layout
->magic
, &BCHFS_MAGIC
)) {
270 prt_printf(out
, "Not a bcachefs superblock layout");
271 return -BCH_ERR_invalid_sb_layout
;
274 if (layout
->layout_type
!= 0) {
275 prt_printf(out
, "Invalid superblock layout type %u",
276 layout
->layout_type
);
277 return -BCH_ERR_invalid_sb_layout_type
;
280 if (!layout
->nr_superblocks
) {
281 prt_printf(out
, "Invalid superblock layout: no superblocks");
282 return -BCH_ERR_invalid_sb_layout_nr_superblocks
;
285 if (layout
->nr_superblocks
> ARRAY_SIZE(layout
->sb_offset
)) {
286 prt_printf(out
, "Invalid superblock layout: too many superblocks");
287 return -BCH_ERR_invalid_sb_layout_nr_superblocks
;
/*
 * NOTE(review): the literal 1 is an int, so this shift is performed in int
 * before being widened to u64, and no visible check bounds
 * layout->sb_max_size_bits (read from disk). A value >= 31 would overflow or
 * be undefined - confirm it is range-checked, or shift a 64-bit constant.
 */
290 max_sectors
= 1 << layout
->sb_max_size_bits
;
292 prev_offset
= le64_to_cpu(layout
->sb_offset
[0]);
294 for (i
= 1; i
< layout
->nr_superblocks
; i
++) {
295 offset
= le64_to_cpu(layout
->sb_offset
[i
]);
297 if (offset
< prev_offset
+ max_sectors
) {
298 prt_printf(out
, "Invalid superblock layout: superblocks overlap\n"
299 " (sb %u ends at %llu next starts at %llu",
300 i
- 1, prev_offset
+ max_sectors
, offset
);
301 return -BCH_ERR_invalid_sb_layout_superblocks_overlap
;
303 prev_offset
= offset
;
/*
 * bch2_sb_compatible - check that the superblock's version fields are ones
 * this code can handle.
 *
 * Reads sb->version and sb->version_min (both little-endian u16) and
 * returns -BCH_ERR_invalid_sb_version, after describing the problem in
 * @out, when:
 *  - version is rejected by bch2_version_compatible();
 *  - version_min is rejected by bch2_version_compatible();
 *  - version_min is greater than version.
 *
 * The first two messages also print bcachefs_metadata_version_min and
 * bcachefs_metadata_version_current as the supported range.
 *
 * NOTE(review): the success return is on a line missing from this listing.
 */
309 static int bch2_sb_compatible(struct bch_sb
*sb
, struct printbuf
*out
)
311 u16 version
= le16_to_cpu(sb
->version
);
312 u16 version_min
= le16_to_cpu(sb
->version_min
);
314 if (!bch2_version_compatible(version
)) {
315 prt_str(out
, "Unsupported superblock version ");
316 bch2_version_to_text(out
, version
);
317 prt_str(out
, " (min ");
318 bch2_version_to_text(out
, bcachefs_metadata_version_min
);
319 prt_str(out
, ", max ");
320 bch2_version_to_text(out
, bcachefs_metadata_version_current
);
322 return -BCH_ERR_invalid_sb_version
;
325 if (!bch2_version_compatible(version_min
)) {
326 prt_str(out
, "Unsupported superblock version_min ");
327 bch2_version_to_text(out
, version_min
);
328 prt_str(out
, " (min ");
329 bch2_version_to_text(out
, bcachefs_metadata_version_min
);
330 prt_str(out
, ", max ");
331 bch2_version_to_text(out
, bcachefs_metadata_version_current
);
333 return -BCH_ERR_invalid_sb_version
;
336 if (version_min
> version
) {
337 prt_str(out
, "Bad minimum version ");
338 bch2_version_to_text(out
, version_min
);
339 prt_str(out
, ", greater than version field ");
340 bch2_version_to_text(out
, version
);
341 return -BCH_ERR_invalid_sb_version
;
347 static int bch2_sb_validate(struct bch_sb_handle
*disk_sb
, struct printbuf
*out
,
350 struct bch_sb
*sb
= disk_sb
->sb
;
351 struct bch_sb_field_members_v1
*mi
;
352 enum bch_opt_id opt_id
;
356 ret
= bch2_sb_compatible(sb
, out
);
360 if (sb
->features
[1] ||
361 (le64_to_cpu(sb
->features
[0]) & (~0ULL << BCH_FEATURE_NR
))) {
362 prt_printf(out
, "Filesystem has incompatible features");
363 return -BCH_ERR_invalid_sb_features
;
366 block_size
= le16_to_cpu(sb
->block_size
);
368 if (block_size
> PAGE_SECTORS
) {
369 prt_printf(out
, "Block size too big (got %u, max %u)",
370 block_size
, PAGE_SECTORS
);
371 return -BCH_ERR_invalid_sb_block_size
;
374 if (bch2_is_zero(sb
->user_uuid
.b
, sizeof(sb
->user_uuid
))) {
375 prt_printf(out
, "Bad user UUID (got zeroes)");
376 return -BCH_ERR_invalid_sb_uuid
;
379 if (bch2_is_zero(sb
->uuid
.b
, sizeof(sb
->uuid
))) {
380 prt_printf(out
, "Bad internal UUID (got zeroes)");
381 return -BCH_ERR_invalid_sb_uuid
;
384 if (!sb
->nr_devices
||
385 sb
->nr_devices
> BCH_SB_MEMBERS_MAX
) {
386 prt_printf(out
, "Bad number of member devices %u (max %u)",
387 sb
->nr_devices
, BCH_SB_MEMBERS_MAX
);
388 return -BCH_ERR_invalid_sb_too_many_members
;
391 if (sb
->dev_idx
>= sb
->nr_devices
) {
392 prt_printf(out
, "Bad dev_idx (got %u, nr_devices %u)",
393 sb
->dev_idx
, sb
->nr_devices
);
394 return -BCH_ERR_invalid_sb_dev_idx
;
397 if (!sb
->time_precision
||
398 le32_to_cpu(sb
->time_precision
) > NSEC_PER_SEC
) {
399 prt_printf(out
, "Invalid time precision: %u (min 1, max %lu)",
400 le32_to_cpu(sb
->time_precision
), NSEC_PER_SEC
);
401 return -BCH_ERR_invalid_sb_time_precision
;
406 * Been seeing a bug where these are getting inexplicably
407 * zeroed, so we're now validating them, but we have to be
408 * careful not to prevent people's filesystems from mounting:
410 if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb
))
411 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb
, 1000);
412 if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb
))
413 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb
, 1000);
415 if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb
))
416 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb
, le16_to_cpu(sb
->version
));
419 for (opt_id
= 0; opt_id
< bch2_opts_nr
; opt_id
++) {
420 const struct bch_option
*opt
= bch2_opt_table
+ opt_id
;
422 if (opt
->get_sb
!= BCH2_NO_SB_OPT
) {
423 u64 v
= bch2_opt_from_sb(sb
, opt_id
);
425 prt_printf(out
, "Invalid option ");
426 ret
= bch2_opt_validate(opt
, v
, out
);
434 /* validate layout */
435 ret
= validate_sb_layout(&sb
->layout
, out
);
439 vstruct_for_each(sb
, f
) {
441 prt_printf(out
, "Invalid superblock: optional field with size 0 (type %u)",
442 le32_to_cpu(f
->type
));
443 return -BCH_ERR_invalid_sb_field_size
;
446 if (vstruct_next(f
) > vstruct_last(sb
)) {
447 prt_printf(out
, "Invalid superblock: optional field extends past end of superblock (type %u)",
448 le32_to_cpu(f
->type
));
449 return -BCH_ERR_invalid_sb_field_size
;
453 /* members must be validated first: */
454 mi
= bch2_sb_field_get(sb
, members_v1
);
456 prt_printf(out
, "Invalid superblock: member info area missing");
457 return -BCH_ERR_invalid_sb_members_missing
;
460 ret
= bch2_sb_field_validate(sb
, &mi
->field
, out
);
464 vstruct_for_each(sb
, f
) {
465 if (le32_to_cpu(f
->type
) == BCH_SB_FIELD_members_v1
)
468 ret
= bch2_sb_field_validate(sb
, f
, out
);
478 static unsigned long le_ulong_to_cpu(unsigned long v
)
480 return sizeof(unsigned long) == 8
/*
 * le_bitvector_to_cpu - convert a bit vector of @nr bits from @src into
 * @dst, one unsigned long at a time, passing each word through
 * le_ulong_to_cpu().
 *
 * @nr must be a multiple of BITS_PER_TYPE(long); anything else trips the
 * BUG_ON. BITS_TO_LONGS(nr) words are read from @src and written to @dst.
 */
485 static void le_bitvector_to_cpu(unsigned long *dst
, unsigned long *src
, unsigned nr
)
487 BUG_ON(nr
& (BITS_PER_TYPE(long) - 1));
489 for (unsigned i
= 0; i
< BITS_TO_LONGS(nr
); i
++)
490 dst
[i
] = le_ulong_to_cpu(src
[i
]);
493 static void bch2_sb_update(struct bch_fs
*c
)
495 struct bch_sb
*src
= c
->disk_sb
.sb
;
497 lockdep_assert_held(&c
->sb_lock
);
499 c
->sb
.uuid
= src
->uuid
;
500 c
->sb
.user_uuid
= src
->user_uuid
;
501 c
->sb
.version
= le16_to_cpu(src
->version
);
502 c
->sb
.version_min
= le16_to_cpu(src
->version_min
);
503 c
->sb
.version_upgrade_complete
= BCH_SB_VERSION_UPGRADE_COMPLETE(src
);
504 c
->sb
.nr_devices
= src
->nr_devices
;
505 c
->sb
.clean
= BCH_SB_CLEAN(src
);
506 c
->sb
.encryption_type
= BCH_SB_ENCRYPTION_TYPE(src
);
508 c
->sb
.nsec_per_time_unit
= le32_to_cpu(src
->time_precision
);
509 c
->sb
.time_units_per_sec
= NSEC_PER_SEC
/ c
->sb
.nsec_per_time_unit
;
511 /* XXX this is wrong, we need a 96 or 128 bit integer type */
512 c
->sb
.time_base_lo
= div_u64(le64_to_cpu(src
->time_base_lo
),
513 c
->sb
.nsec_per_time_unit
);
514 c
->sb
.time_base_hi
= le32_to_cpu(src
->time_base_hi
);
516 c
->sb
.features
= le64_to_cpu(src
->features
[0]);
517 c
->sb
.compat
= le64_to_cpu(src
->compat
[0]);
519 memset(c
->sb
.errors_silent
, 0, sizeof(c
->sb
.errors_silent
));
521 struct bch_sb_field_ext
*ext
= bch2_sb_field_get(src
, ext
);
523 le_bitvector_to_cpu(c
->sb
.errors_silent
, (void *) ext
->errors_silent
,
524 sizeof(c
->sb
.errors_silent
) * 8);
526 for_each_member_device(c
, ca
) {
527 struct bch_member m
= bch2_sb_member_get(src
, ca
->dev_idx
);
528 ca
->mi
= bch2_mi_to_cpu(&m
);
532 static int __copy_super(struct bch_sb_handle
*dst_handle
, struct bch_sb
*src
)
534 struct bch_sb_field
*src_f
, *dst_f
;
535 struct bch_sb
*dst
= dst_handle
->sb
;
538 dst
->version
= src
->version
;
539 dst
->version_min
= src
->version_min
;
541 dst
->uuid
= src
->uuid
;
542 dst
->user_uuid
= src
->user_uuid
;
543 memcpy(dst
->label
, src
->label
, sizeof(dst
->label
));
545 dst
->block_size
= src
->block_size
;
546 dst
->nr_devices
= src
->nr_devices
;
548 dst
->time_base_lo
= src
->time_base_lo
;
549 dst
->time_base_hi
= src
->time_base_hi
;
550 dst
->time_precision
= src
->time_precision
;
551 dst
->write_time
= src
->write_time
;
553 memcpy(dst
->flags
, src
->flags
, sizeof(dst
->flags
));
554 memcpy(dst
->features
, src
->features
, sizeof(dst
->features
));
555 memcpy(dst
->compat
, src
->compat
, sizeof(dst
->compat
));
557 for (i
= 0; i
< BCH_SB_FIELD_NR
; i
++) {
560 if ((1U << i
) & BCH_SINGLE_DEVICE_SB_FIELDS
)
563 src_f
= bch2_sb_field_get_id(src
, i
);
564 dst_f
= bch2_sb_field_get_id(dst
, i
);
566 d
= (src_f
? le32_to_cpu(src_f
->u64s
) : 0) -
567 (dst_f
? le32_to_cpu(dst_f
->u64s
) : 0);
569 int ret
= bch2_sb_realloc(dst_handle
,
570 le32_to_cpu(dst_handle
->sb
->u64s
) + d
);
575 dst
= dst_handle
->sb
;
576 dst_f
= bch2_sb_field_get_id(dst
, i
);
579 dst_f
= __bch2_sb_field_resize(dst_handle
, dst_f
,
580 src_f
? le32_to_cpu(src_f
->u64s
) : 0);
583 memcpy(dst_f
, src_f
, vstruct_bytes(src_f
));
589 int bch2_sb_to_fs(struct bch_fs
*c
, struct bch_sb
*src
)
593 lockdep_assert_held(&c
->sb_lock
);
595 ret
= bch2_sb_realloc(&c
->disk_sb
, 0) ?:
596 __copy_super(&c
->disk_sb
, src
) ?:
597 bch2_sb_replicas_to_cpu_replicas(c
) ?:
598 bch2_sb_disk_groups_to_cpu(c
);
606 int bch2_sb_from_fs(struct bch_fs
*c
, struct bch_dev
*ca
)
608 return __copy_super(&ca
->disk_sb
, c
->disk_sb
.sb
);
611 /* read superblock: */
613 static int read_one_super(struct bch_sb_handle
*sb
, u64 offset
, struct printbuf
*err
)
618 bio_reset(sb
->bio
, sb
->bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
619 sb
->bio
->bi_iter
.bi_sector
= offset
;
620 bch2_bio_map(sb
->bio
, sb
->sb
, sb
->buffer_size
);
622 ret
= submit_bio_wait(sb
->bio
);
624 prt_printf(err
, "IO error: %i", ret
);
628 if (!uuid_equal(&sb
->sb
->magic
, &BCACHE_MAGIC
) &&
629 !uuid_equal(&sb
->sb
->magic
, &BCHFS_MAGIC
)) {
630 prt_str(err
, "Not a bcachefs superblock (got magic ");
631 pr_uuid(err
, sb
->sb
->magic
.b
);
633 return -BCH_ERR_invalid_sb_magic
;
636 ret
= bch2_sb_compatible(sb
->sb
, err
);
640 bytes
= vstruct_bytes(sb
->sb
);
642 if (bytes
> 512 << sb
->sb
->layout
.sb_max_size_bits
) {
643 prt_printf(err
, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
644 bytes
, 512UL << sb
->sb
->layout
.sb_max_size_bits
);
645 return -BCH_ERR_invalid_sb_too_big
;
648 if (bytes
> sb
->buffer_size
) {
649 ret
= bch2_sb_realloc(sb
, le32_to_cpu(sb
->sb
->u64s
));
655 enum bch_csum_type csum_type
= BCH_SB_CSUM_TYPE(sb
->sb
);
656 if (csum_type
>= BCH_CSUM_NR
) {
657 prt_printf(err
, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb
->sb
));
658 return -BCH_ERR_invalid_sb_csum_type
;
661 /* XXX: verify MACs */
662 struct bch_csum csum
= csum_vstruct(NULL
, csum_type
, null_nonce(), sb
->sb
);
663 if (bch2_crc_cmp(csum
, sb
->sb
->csum
)) {
664 bch2_csum_err_msg(err
, csum_type
, sb
->sb
->csum
, csum
);
665 return -BCH_ERR_invalid_sb_csum
;
668 sb
->seq
= le64_to_cpu(sb
->sb
->seq
);
673 static int __bch2_read_super(const char *path
, struct bch_opts
*opts
,
674 struct bch_sb_handle
*sb
, bool ignore_notbchfs_msg
)
676 u64 offset
= opt_get(*opts
, sb
);
677 struct bch_sb_layout layout
;
678 struct printbuf err
= PRINTBUF
;
679 struct printbuf err2
= PRINTBUF
;
685 memset(sb
, 0, sizeof(*sb
));
686 sb
->mode
= BLK_OPEN_READ
;
688 sb
->holder
= kmalloc(1, GFP_KERNEL
);
692 sb
->sb_name
= kstrdup(path
, GFP_KERNEL
);
697 if (opt_get(*opts
, direct_io
) == false)
698 sb
->mode
|= BLK_OPEN_BUFFERED
;
701 if (!opt_get(*opts
, noexcl
))
702 sb
->mode
|= BLK_OPEN_EXCL
;
704 if (!opt_get(*opts
, nochanges
))
705 sb
->mode
|= BLK_OPEN_WRITE
;
707 sb
->s_bdev_file
= bdev_file_open_by_path(path
, sb
->mode
, sb
->holder
, &bch2_sb_handle_bdev_ops
);
708 if (IS_ERR(sb
->s_bdev_file
) &&
709 PTR_ERR(sb
->s_bdev_file
) == -EACCES
&&
710 opt_get(*opts
, read_only
)) {
711 sb
->mode
&= ~BLK_OPEN_WRITE
;
713 sb
->s_bdev_file
= bdev_file_open_by_path(path
, sb
->mode
, sb
->holder
, &bch2_sb_handle_bdev_ops
);
714 if (!IS_ERR(sb
->s_bdev_file
))
715 opt_set(*opts
, nochanges
, true);
718 if (IS_ERR(sb
->s_bdev_file
)) {
719 ret
= PTR_ERR(sb
->s_bdev_file
);
722 sb
->bdev
= file_bdev(sb
->s_bdev_file
);
724 ret
= bch2_sb_realloc(sb
, 0);
726 prt_printf(&err
, "error allocating memory for superblock");
730 if (bch2_fs_init_fault("read_super")) {
731 prt_printf(&err
, "dynamic fault");
736 ret
= read_one_super(sb
, offset
, &err
);
740 if (opt_defined(*opts
, sb
))
743 prt_printf(&err2
, "bcachefs (%s): error reading default superblock: %s\n",
745 if (ret
== -BCH_ERR_invalid_sb_magic
&& ignore_notbchfs_msg
)
746 printk(KERN_INFO
"%s", err2
.buf
);
748 printk(KERN_ERR
"%s", err2
.buf
);
750 printbuf_exit(&err2
);
751 printbuf_reset(&err
);
754 * Error reading primary superblock - read location of backup
757 bio_reset(sb
->bio
, sb
->bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
758 sb
->bio
->bi_iter
.bi_sector
= BCH_SB_LAYOUT_SECTOR
;
760 * use sb buffer to read layout, since sb buffer is page aligned but
763 bch2_bio_map(sb
->bio
, sb
->sb
, sizeof(struct bch_sb_layout
));
765 ret
= submit_bio_wait(sb
->bio
);
767 prt_printf(&err
, "IO error: %i", ret
);
771 memcpy(&layout
, sb
->sb
, sizeof(layout
));
772 ret
= validate_sb_layout(&layout
, &err
);
776 for (i
= layout
.sb_offset
;
777 i
< layout
.sb_offset
+ layout
.nr_superblocks
; i
++) {
778 offset
= le64_to_cpu(*i
);
780 if (offset
== opt_get(*opts
, sb
))
783 ret
= read_one_super(sb
, offset
, &err
);
791 if (le16_to_cpu(sb
->sb
->block_size
) << 9 <
792 bdev_logical_block_size(sb
->bdev
) &&
793 opt_get(*opts
, direct_io
)) {
795 opt_set(*opts
, direct_io
, false);
799 prt_printf(&err
, "block size (%u) smaller than device block size (%u)",
800 le16_to_cpu(sb
->sb
->block_size
) << 9,
801 bdev_logical_block_size(sb
->bdev
));
802 ret
= -BCH_ERR_block_size_too_small
;
807 sb
->have_layout
= true;
809 ret
= bch2_sb_validate(sb
, &err
, READ
);
811 printk(KERN_ERR
"bcachefs (%s): error validating superblock: %s\n",
819 printk(KERN_ERR
"bcachefs (%s): error reading superblock: %s\n",
/*
 * bch2_read_super - read the superblock of the device at @path into @sb.
 *
 * Thin wrapper: forwards @path, @opts and @sb to __bch2_read_super() with
 * ignore_notbchfs_msg = false and returns its result unchanged.
 */
826 int bch2_read_super(const char *path
, struct bch_opts
*opts
,
827 struct bch_sb_handle
*sb
)
829 return __bch2_read_super(path
, opts
, sb
, false);
832 /* provide a silenced version for mount.bcachefs */
/*
 * bch2_read_super_silent - same as bch2_read_super(), but forwards to
 * __bch2_read_super() with ignore_notbchfs_msg = true and returns its
 * result unchanged.
 */
834 int bch2_read_super_silent(const char *path
, struct bch_opts
*opts
,
835 struct bch_sb_handle
*sb
)
837 return __bch2_read_super(path
, opts
, sb
, true);
840 /* write superblock: */
842 static void write_super_endio(struct bio
*bio
)
844 struct bch_dev
*ca
= bio
->bi_private
;
846 /* XXX: return errors directly */
848 if (bch2_dev_io_err_on(bio
->bi_status
, ca
,
850 ? BCH_MEMBER_ERROR_write
851 : BCH_MEMBER_ERROR_read
,
852 "superblock %s error: %s",
853 bio_data_dir(bio
) ? "write" : "read",
854 bch2_blk_status_to_str(bio
->bi_status
)))
855 ca
->sb_write_error
= 1;
857 closure_put(&ca
->fs
->sb_write
);
858 percpu_ref_put(&ca
->io_ref
);
861 static void read_back_super(struct bch_fs
*c
, struct bch_dev
*ca
)
863 struct bch_sb
*sb
= ca
->disk_sb
.sb
;
864 struct bio
*bio
= ca
->disk_sb
.bio
;
866 bio_reset(bio
, ca
->disk_sb
.bdev
, REQ_OP_READ
|REQ_SYNC
|REQ_META
);
867 bio
->bi_iter
.bi_sector
= le64_to_cpu(sb
->layout
.sb_offset
[0]);
868 bio
->bi_end_io
= write_super_endio
;
869 bio
->bi_private
= ca
;
870 bch2_bio_map(bio
, ca
->sb_read_scratch
, PAGE_SIZE
);
872 this_cpu_add(ca
->io_done
->sectors
[READ
][BCH_DATA_sb
],
875 percpu_ref_get(&ca
->io_ref
);
876 closure_bio_submit(bio
, &c
->sb_write
);
879 static void write_one_super(struct bch_fs
*c
, struct bch_dev
*ca
, unsigned idx
)
881 struct bch_sb
*sb
= ca
->disk_sb
.sb
;
882 struct bio
*bio
= ca
->disk_sb
.bio
;
884 sb
->offset
= sb
->layout
.sb_offset
[idx
];
886 SET_BCH_SB_CSUM_TYPE(sb
, bch2_csum_opt_to_type(c
->opts
.metadata_checksum
, false));
887 sb
->csum
= csum_vstruct(c
, BCH_SB_CSUM_TYPE(sb
),
890 bio_reset(bio
, ca
->disk_sb
.bdev
, REQ_OP_WRITE
|REQ_SYNC
|REQ_META
);
891 bio
->bi_iter
.bi_sector
= le64_to_cpu(sb
->offset
);
892 bio
->bi_end_io
= write_super_endio
;
893 bio
->bi_private
= ca
;
894 bch2_bio_map(bio
, sb
,
895 roundup((size_t) vstruct_bytes(sb
),
896 bdev_logical_block_size(ca
->disk_sb
.bdev
)));
898 this_cpu_add(ca
->io_done
->sectors
[WRITE
][BCH_DATA_sb
],
901 percpu_ref_get(&ca
->io_ref
);
902 closure_bio_submit(bio
, &c
->sb_write
);
905 int bch2_write_super(struct bch_fs
*c
)
907 struct closure
*cl
= &c
->sb_write
;
908 struct printbuf err
= PRINTBUF
;
909 unsigned sb
= 0, nr_wrote
;
910 struct bch_devs_mask sb_written
;
911 bool wrote
, can_mount_without_written
, can_mount_with_written
;
912 unsigned degraded_flags
= BCH_FORCE_IF_DEGRADED
;
915 trace_and_count(c
, write_super
, c
, _RET_IP_
);
917 if (c
->opts
.very_degraded
)
918 degraded_flags
|= BCH_FORCE_IF_LOST
;
920 lockdep_assert_held(&c
->sb_lock
);
922 closure_init_stack(cl
);
923 memset(&sb_written
, 0, sizeof(sb_written
));
925 /* Make sure we're using the new magic numbers: */
926 c
->disk_sb
.sb
->magic
= BCHFS_MAGIC
;
927 c
->disk_sb
.sb
->layout
.magic
= BCHFS_MAGIC
;
929 le64_add_cpu(&c
->disk_sb
.sb
->seq
, 1);
931 struct bch_sb_field_members_v2
*mi
= bch2_sb_field_get(c
->disk_sb
.sb
, members_v2
);
932 for_each_online_member(c
, ca
)
933 __bch2_members_v2_get_mut(mi
, ca
->dev_idx
)->seq
= c
->disk_sb
.sb
->seq
;
934 c
->disk_sb
.sb
->write_time
= cpu_to_le64(ktime_get_real_seconds());
936 if (test_bit(BCH_FS_error
, &c
->flags
))
937 SET_BCH_SB_HAS_ERRORS(c
->disk_sb
.sb
, 1);
938 if (test_bit(BCH_FS_topology_error
, &c
->flags
))
939 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c
->disk_sb
.sb
, 1);
941 SET_BCH_SB_BIG_ENDIAN(c
->disk_sb
.sb
, CPU_BIG_ENDIAN
);
943 bch2_sb_counters_from_cpu(c
);
944 bch2_sb_members_from_cpu(c
);
945 bch2_sb_members_cpy_v2_v1(&c
->disk_sb
);
946 bch2_sb_errors_from_cpu(c
);
947 bch2_sb_downgrade_update(c
);
949 for_each_online_member(c
, ca
)
950 bch2_sb_from_fs(c
, ca
);
952 for_each_online_member(c
, ca
) {
953 printbuf_reset(&err
);
955 ret
= bch2_sb_validate(&ca
->disk_sb
, &err
, WRITE
);
957 bch2_fs_inconsistent(c
, "sb invalid before write: %s", err
.buf
);
958 percpu_ref_put(&ca
->io_ref
);
963 if (c
->opts
.nochanges
)
967 * Defer writing the superblock until filesystem initialization is
968 * complete - don't write out a partly initialized superblock:
970 if (!BCH_SB_INITIALIZED(c
->disk_sb
.sb
))
973 if (le16_to_cpu(c
->disk_sb
.sb
->version
) > bcachefs_metadata_version_current
) {
974 struct printbuf buf
= PRINTBUF
;
975 prt_printf(&buf
, "attempting to write superblock that wasn't version downgraded (");
976 bch2_version_to_text(&buf
, le16_to_cpu(c
->disk_sb
.sb
->version
));
977 prt_str(&buf
, " > ");
978 bch2_version_to_text(&buf
, bcachefs_metadata_version_current
);
980 bch2_fs_fatal_error(c
, "%s", buf
.buf
);
982 return -BCH_ERR_sb_not_downgraded
;
985 for_each_online_member(c
, ca
) {
986 __set_bit(ca
->dev_idx
, sb_written
.d
);
987 ca
->sb_write_error
= 0;
990 for_each_online_member(c
, ca
)
991 read_back_super(c
, ca
);
994 for_each_online_member(c
, ca
) {
995 if (ca
->sb_write_error
)
998 if (le64_to_cpu(ca
->sb_read_scratch
->seq
) < ca
->disk_sb
.seq
) {
999 bch2_fs_fatal_error(c
,
1000 "Superblock write was silently dropped! (seq %llu expected %llu)",
1001 le64_to_cpu(ca
->sb_read_scratch
->seq
),
1003 percpu_ref_put(&ca
->io_ref
);
1004 ret
= -BCH_ERR_erofs_sb_err
;
1008 if (le64_to_cpu(ca
->sb_read_scratch
->seq
) > ca
->disk_sb
.seq
) {
1009 bch2_fs_fatal_error(c
,
1010 "Superblock modified by another process (seq %llu expected %llu)",
1011 le64_to_cpu(ca
->sb_read_scratch
->seq
),
1013 percpu_ref_put(&ca
->io_ref
);
1014 ret
= -BCH_ERR_erofs_sb_err
;
1021 for_each_online_member(c
, ca
)
1022 if (!ca
->sb_write_error
&&
1023 sb
< ca
->disk_sb
.sb
->layout
.nr_superblocks
) {
1024 write_one_super(c
, ca
, sb
);
1031 for_each_online_member(c
, ca
) {
1032 if (ca
->sb_write_error
)
1033 __clear_bit(ca
->dev_idx
, sb_written
.d
);
1035 ca
->disk_sb
.seq
= le64_to_cpu(ca
->disk_sb
.sb
->seq
);
1038 nr_wrote
= dev_mask_nr(&sb_written
);
1040 can_mount_with_written
=
1041 bch2_have_enough_devs(c
, sb_written
, degraded_flags
, false);
1043 for (unsigned i
= 0; i
< ARRAY_SIZE(sb_written
.d
); i
++)
1044 sb_written
.d
[i
] = ~sb_written
.d
[i
];
1046 can_mount_without_written
=
1047 bch2_have_enough_devs(c
, sb_written
, degraded_flags
, false);
1050 * If we would be able to mount _without_ the devices we successfully
1051 * wrote superblocks to, we weren't able to write to enough devices:
1053 * Exception: if we can mount without the successes because we haven't
1054 * written anything (new filesystem), we continue if we'd be able to
1055 * mount with the devices we did successfully write to:
1057 if (bch2_fs_fatal_err_on(!nr_wrote
||
1058 !can_mount_with_written
||
1059 (can_mount_without_written
&&
1060 !can_mount_with_written
), c
,
1061 "Unable to write superblock to sufficient devices (from %ps)",
1065 /* Make new options visible after they're persistent: */
1067 printbuf_exit(&err
);
/*
 * __bch2_check_set_feature - make sure feature bit @feat is set in the
 * on-disk superblock.
 *
 * Takes c->sb_lock, tests bit @feat in the cached c->sb.features, ORs the
 * bit into disk_sb.sb->features[0] (stored little endian) when it is not
 * yet set, calls bch2_write_super(), and releases the lock.
 *
 * NOTE(review): the result of bch2_write_super() is not used on the visible
 * line, and the closing brace of the `if` is missing from this listing, so
 * whether the write is conditional cannot be seen here - confirm.
 */
1071 void __bch2_check_set_feature(struct bch_fs
*c
, unsigned feat
)
1073 mutex_lock(&c
->sb_lock
);
1074 if (!(c
->sb
.features
& (1ULL << feat
))) {
1075 c
->disk_sb
.sb
->features
[0] |= cpu_to_le64(1ULL << feat
);
1077 bch2_write_super(c
);
1079 mutex_unlock(&c
->sb_lock
);
1082 /* Downgrade if superblock is at a higher version than currently supported: */
1083 bool bch2_check_version_downgrade(struct bch_fs
*c
)
1085 bool ret
= bcachefs_metadata_version_current
< c
->sb
.version
;
1087 lockdep_assert_held(&c
->sb_lock
);
1090 * Downgrade, if superblock is at a higher version than currently
1093 * c->sb will be checked before we write the superblock, so update it as
1096 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
) > bcachefs_metadata_version_current
) {
1097 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c
->disk_sb
.sb
, bcachefs_metadata_version_current
);
1098 c
->sb
.version_upgrade_complete
= bcachefs_metadata_version_current
;
1100 if (c
->sb
.version
> bcachefs_metadata_version_current
) {
1101 c
->disk_sb
.sb
->version
= cpu_to_le16(bcachefs_metadata_version_current
);
1102 c
->sb
.version
= bcachefs_metadata_version_current
;
1104 if (c
->sb
.version_min
> bcachefs_metadata_version_current
) {
1105 c
->disk_sb
.sb
->version_min
= cpu_to_le16(bcachefs_metadata_version_current
);
1106 c
->sb
.version_min
= bcachefs_metadata_version_current
;
1108 c
->disk_sb
.sb
->compat
[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR
) - 1);
/*
 * bch2_sb_upgrade - record @new_version as the superblock version.
 *
 * Caller must hold c->sb_lock (asserted via lockdep).
 *
 * When the major number of @new_version is greater than the major number of
 * the version currently stored in the superblock, the `downgrade` field is
 * first resized to 0 u64s. The version is then stored (little endian) and
 * every bit of BCH_SB_FEATURES_ALL is ORed into features[0].
 */
1112 void bch2_sb_upgrade(struct bch_fs
*c
, unsigned new_version
)
1114 lockdep_assert_held(&c
->sb_lock
);
1116 if (BCH_VERSION_MAJOR(new_version
) >
1117 BCH_VERSION_MAJOR(le16_to_cpu(c
->disk_sb
.sb
->version
)))
1118 bch2_sb_field_resize(&c
->disk_sb
, downgrade
, 0);
1120 c
->disk_sb
.sb
->version
= cpu_to_le16(new_version
);
1121 c
->disk_sb
.sb
->features
[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL
);
/*
 * bch2_sb_ext_validate - validate callback for the `ext` superblock field.
 *
 * Rejects the field with -BCH_ERR_invalid_sb_ext, after printing the actual
 * and required sizes to @err, when vstruct_bytes(f) is below 88.
 *
 * NOTE(review): 88 is a bare constant here; presumably it is the minimum
 * on-disk size of struct bch_sb_field_ext - confirm, and consider naming it.
 * The success return is on a line missing from this listing.
 */
1124 static int bch2_sb_ext_validate(struct bch_sb
*sb
, struct bch_sb_field
*f
,
1125 struct printbuf
*err
)
1127 if (vstruct_bytes(f
) < 88) {
1128 prt_printf(err
, "field too small (%zu < %u)", vstruct_bytes(f
), 88);
1129 return -BCH_ERR_invalid_sb_ext
;
1135 static void bch2_sb_ext_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1136 struct bch_sb_field
*f
)
1138 struct bch_sb_field_ext
*e
= field_to_type(f
, ext
);
1140 prt_printf(out
, "Recovery passes required:");
1142 prt_bitflags(out
, bch2_recovery_passes
,
1143 bch2_recovery_passes_from_stable(le64_to_cpu(e
->recovery_passes_required
[0])));
1146 unsigned long *errors_silent
= kmalloc(sizeof(e
->errors_silent
), GFP_KERNEL
);
1147 if (errors_silent
) {
1148 le_bitvector_to_cpu(errors_silent
, (void *) e
->errors_silent
, sizeof(e
->errors_silent
) * 8);
1150 prt_printf(out
, "Errors to silently fix:");
1152 prt_bitflags_vector(out
, bch2_sb_error_strs
, errors_silent
, sizeof(e
->errors_silent
) * 8);
1155 kfree(errors_silent
);
1159 static const struct bch_sb_field_ops bch_sb_field_ops_ext
= {
1160 .validate
= bch2_sb_ext_validate
,
1161 .to_text
= bch2_sb_ext_to_text
,
1164 static const struct bch_sb_field_ops
*bch2_sb_field_ops
[] = {
1166 [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
1171 static const struct bch_sb_field_ops bch2_sb_field_null_ops
;
/*
 * bch2_sb_field_type_ops - map a superblock field type to its ops table.
 *
 * Returns bch2_sb_field_ops[type] when @type indexes inside that array, and
 * a pointer to bch2_sb_field_null_ops otherwise, so an out-of-range type
 * never indexes past the end of the array.
 */
1173 static const struct bch_sb_field_ops
*bch2_sb_field_type_ops(unsigned type
)
1175 return likely(type
< ARRAY_SIZE(bch2_sb_field_ops
))
1176 ? bch2_sb_field_ops
[type
]
1177 : &bch2_sb_field_null_ops
;
/*
 * bch2_sb_field_validate - run the type-specific validate hook for field @f.
 *
 * Resolves the ops table for f->type via bch2_sb_field_type_ops() and calls
 * ops->validate() when the hook is set; a missing hook counts as success
 * (ret = 0). The hook writes its message into a private printbuf
 * (field_err), which is released with printbuf_exit() before returning.
 *
 * The visible error reporting prefixes the hook's message with
 * "Invalid superblock section <name>: " using bch2_sb_fields[type] and then
 * appends a dump of the field via bch2_sb_field_to_text().
 *
 * NOTE(review): the declaration of `ret`, the condition guarding the error
 * reporting and the return statement are on lines missing from this listing
 * - presumably the reporting runs only when ret is non-zero; confirm.
 */
1180 static int bch2_sb_field_validate(struct bch_sb
*sb
, struct bch_sb_field
*f
,
1181 struct printbuf
*err
)
1183 unsigned type
= le32_to_cpu(f
->type
);
1184 struct printbuf field_err
= PRINTBUF
;
1185 const struct bch_sb_field_ops
*ops
= bch2_sb_field_type_ops(type
);
1188 ret
= ops
->validate
? ops
->validate(sb
, f
, &field_err
) : 0;
1190 prt_printf(err
, "Invalid superblock section %s: %s",
1191 bch2_sb_fields
[type
], field_err
.buf
);
1193 bch2_sb_field_to_text(err
, sb
, f
);
1196 printbuf_exit(&field_err
);
1200 void __bch2_sb_field_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1201 struct bch_sb_field
*f
)
1203 unsigned type
= le32_to_cpu(f
->type
);
1204 const struct bch_sb_field_ops
*ops
= bch2_sb_field_type_ops(type
);
1206 if (!out
->nr_tabstops
)
1207 printbuf_tabstop_push(out
, 32);
1210 ops
->to_text(out
, sb
, f
);
1213 void bch2_sb_field_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1214 struct bch_sb_field
*f
)
1216 unsigned type
= le32_to_cpu(f
->type
);
1218 if (type
< BCH_SB_FIELD_NR
)
1219 prt_printf(out
, "%s", bch2_sb_fields
[type
]);
1221 prt_printf(out
, "(unknown field %u)", type
);
1223 prt_printf(out
, " (size %zu):", vstruct_bytes(f
));
1226 __bch2_sb_field_to_text(out
, sb
, f
);
/*
 * bch2_sb_layout_to_text - pretty-print a superblock layout to @out.
 *
 * Prints, in order: the layout type, the maximum superblock size
 * (512 << sb_max_size_bits, formatted by prt_units_u64()), the number of
 * superblocks, and each sb_offset[] entry converted from little endian.
 *
 * NOTE(review): the declaration of `i` and the separators/newlines emitted
 * between items are on lines missing from this listing.
 */
1229 void bch2_sb_layout_to_text(struct printbuf
*out
, struct bch_sb_layout
*l
)
1233 prt_printf(out
, "Type: %u", l
->layout_type
);
1236 prt_str(out
, "Superblock max size: ");
/*
 * NOTE(review): `512 << ...` is evaluated as int here, whereas
 * bch2_sb_to_text() prints the same quantity with `512ULL << ...`. With a
 * large sb_max_size_bits the int shift overflows - confirm the field is
 * bounded, or use the 64-bit constant for consistency.
 */
1237 prt_units_u64(out
, 512 << l
->sb_max_size_bits
);
1240 prt_printf(out
, "Nr superblocks: %u", l
->nr_superblocks
);
1243 prt_str(out
, "Offsets: ");
1244 for (i
= 0; i
< l
->nr_superblocks
; i
++) {
1247 prt_printf(out
, "%llu", le64_to_cpu(l
->sb_offset
[i
]));
1252 void bch2_sb_to_text(struct printbuf
*out
, struct bch_sb
*sb
,
1253 bool print_layout
, unsigned fields
)
1255 u64 fields_have
= 0;
1256 unsigned nr_devices
= 0;
1258 if (!out
->nr_tabstops
)
1259 printbuf_tabstop_push(out
, 44);
1261 for (int i
= 0; i
< sb
->nr_devices
; i
++)
1262 nr_devices
+= bch2_dev_exists(sb
, i
);
1264 prt_printf(out
, "External UUID:");
1266 pr_uuid(out
, sb
->user_uuid
.b
);
1269 prt_printf(out
, "Internal UUID:");
1271 pr_uuid(out
, sb
->uuid
.b
);
1274 prt_printf(out
, "Magic number:");
1276 pr_uuid(out
, sb
->magic
.b
);
1279 prt_str(out
, "Device index:");
1281 prt_printf(out
, "%u", sb
->dev_idx
);
1284 prt_str(out
, "Label:");
1286 prt_printf(out
, "%.*s", (int) sizeof(sb
->label
), sb
->label
);
1289 prt_str(out
, "Version:");
1291 bch2_version_to_text(out
, le16_to_cpu(sb
->version
));
1294 prt_str(out
, "Version upgrade complete:");
1296 bch2_version_to_text(out
, BCH_SB_VERSION_UPGRADE_COMPLETE(sb
));
1299 prt_printf(out
, "Oldest version on disk:");
1301 bch2_version_to_text(out
, le16_to_cpu(sb
->version_min
));
1304 prt_printf(out
, "Created:");
1306 if (sb
->time_base_lo
)
1307 bch2_prt_datetime(out
, div_u64(le64_to_cpu(sb
->time_base_lo
), NSEC_PER_SEC
));
1309 prt_printf(out
, "(not set)");
1312 prt_printf(out
, "Sequence number:");
1314 prt_printf(out
, "%llu", le64_to_cpu(sb
->seq
));
1317 prt_printf(out
, "Time of last write:");
1319 bch2_prt_datetime(out
, le64_to_cpu(sb
->write_time
));
1322 prt_printf(out
, "Superblock size:");
1324 prt_units_u64(out
, vstruct_bytes(sb
));
1326 prt_units_u64(out
, 512ULL << sb
->layout
.sb_max_size_bits
);
1329 prt_printf(out
, "Clean:");
1331 prt_printf(out
, "%llu", BCH_SB_CLEAN(sb
));
1334 prt_printf(out
, "Devices:");
1336 prt_printf(out
, "%u", nr_devices
);
1339 prt_printf(out
, "Sections:");
1340 vstruct_for_each(sb
, f
)
1341 fields_have
|= 1 << le32_to_cpu(f
->type
);
1343 prt_bitflags(out
, bch2_sb_fields
, fields_have
);
1346 prt_printf(out
, "Features:");
1348 prt_bitflags(out
, bch2_sb_features
, le64_to_cpu(sb
->features
[0]));
1351 prt_printf(out
, "Compat features:");
1353 prt_bitflags(out
, bch2_sb_compat
, le64_to_cpu(sb
->compat
[0]));
1357 prt_printf(out
, "Options:");
1359 printbuf_indent_add(out
, 2);
1363 for (id
= 0; id
< bch2_opts_nr
; id
++) {
1364 const struct bch_option
*opt
= bch2_opt_table
+ id
;
1366 if (opt
->get_sb
!= BCH2_NO_SB_OPT
) {
1367 u64 v
= bch2_opt_from_sb(sb
, id
);
1369 prt_printf(out
, "%s:", opt
->attr
.name
);
1371 bch2_opt_to_text(out
, NULL
, sb
, opt
, v
,
1372 OPT_HUMAN_READABLE
|OPT_SHOW_FULL_LIST
);
1378 printbuf_indent_sub(out
, 2);
1382 prt_printf(out
, "layout:");
1384 printbuf_indent_add(out
, 2);
1385 bch2_sb_layout_to_text(out
, &sb
->layout
);
1386 printbuf_indent_sub(out
, 2);
1389 vstruct_for_each(sb
, f
)
1390 if (fields
& (1 << le32_to_cpu(f
->type
))) {
1392 bch2_sb_field_to_text(out
, sb
, f
);