// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
11 #include "alloc_background.h"
12 #include "alloc_foreground.h"
13 #include "bkey_sort.h"
14 #include "btree_cache.h"
16 #include "btree_journal_iter.h"
17 #include "btree_key_cache.h"
18 #include "btree_update_interior.h"
20 #include "btree_write_buffer.h"
21 #include "buckets_waiting_for_journal.h"
27 #include "disk_groups.h"
33 #include "fs-io-buffered.h"
34 #include "fs-io-direct.h"
40 #include "journal_reclaim.h"
41 #include "journal_seq_blacklist.h"
45 #include "nocow_locking.h"
47 #include "rebalance.h"
51 #include "sb-counters.h"
52 #include "sb-errors.h"
53 #include "sb-members.h"
55 #include "subvolume.h"
59 #include "thread_with_file.h"
62 #include <linux/backing-dev.h>
63 #include <linux/blkdev.h>
64 #include <linux/debugfs.h>
65 #include <linux/device.h>
66 #include <linux/idr.h>
67 #include <linux/module.h>
68 #include <linux/percpu.h>
69 #include <linux/random.h>
70 #include <linux/sysfs.h>
71 #include <crypto/hash.h>
73 MODULE_LICENSE("GPL");
74 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
75 MODULE_DESCRIPTION("bcachefs filesystem");
76 MODULE_SOFTDEP("pre: crc32c");
77 MODULE_SOFTDEP("pre: crc64");
78 MODULE_SOFTDEP("pre: sha256");
79 MODULE_SOFTDEP("pre: chacha20");
80 MODULE_SOFTDEP("pre: poly1305");
81 MODULE_SOFTDEP("pre: xxhash");
83 const char * const bch2_fs_flag_strs
[] = {
91 static void bch2_print_maybe_redirect(struct stdio_redirect
*stdio
, const char *fmt
, va_list args
)
94 if (unlikely(stdio
)) {
95 if (fmt
[0] == KERN_SOH
[0])
98 bch2_stdio_redirect_vprintf(stdio
, true, fmt
, args
);
105 void bch2_print_opts(struct bch_opts
*opts
, const char *fmt
, ...)
107 struct stdio_redirect
*stdio
= (void *)(unsigned long)opts
->stdio
;
111 bch2_print_maybe_redirect(stdio
, fmt
, args
);
115 void __bch2_print(struct bch_fs
*c
, const char *fmt
, ...)
117 struct stdio_redirect
*stdio
= bch2_fs_stdio_redirect(c
);
121 bch2_print_maybe_redirect(stdio
, fmt
, args
);
/*
 * Boilerplate generator for sysfs object types: given a prefix `type`,
 * emits the attribute group (backed by type##_files), the NULL-terminated
 * groups array, and the kobj_type wiring release/sysfs_ops/default_groups.
 */
#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs = type ## _files						\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups = type ## _groups				\
}
141 static void bch2_fs_release(struct kobject
*);
142 static void bch2_dev_release(struct kobject
*);
143 static void bch2_fs_counters_release(struct kobject
*k
)
147 static void bch2_fs_internal_release(struct kobject
*k
)
151 static void bch2_fs_opts_dir_release(struct kobject
*k
)
155 static void bch2_fs_time_stats_release(struct kobject
*k
)
160 KTYPE(bch2_fs_counters
);
161 KTYPE(bch2_fs_internal
);
162 KTYPE(bch2_fs_opts_dir
);
163 KTYPE(bch2_fs_time_stats
);
166 static struct kset
*bcachefs_kset
;
167 static LIST_HEAD(bch_fs_list
);
168 static DEFINE_MUTEX(bch_fs_list_lock
);
170 DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait
);
172 static void bch2_dev_free(struct bch_dev
*);
173 static int bch2_dev_alloc(struct bch_fs
*, unsigned);
174 static int bch2_dev_sysfs_online(struct bch_fs
*, struct bch_dev
*);
175 static void __bch2_dev_read_only(struct bch_fs
*, struct bch_dev
*);
177 struct bch_fs
*bch2_dev_to_fs(dev_t dev
)
181 mutex_lock(&bch_fs_list_lock
);
184 list_for_each_entry(c
, &bch_fs_list
, list
)
185 for_each_member_device_rcu(c
, ca
, NULL
)
186 if (ca
->disk_sb
.bdev
&& ca
->disk_sb
.bdev
->bd_dev
== dev
) {
193 mutex_unlock(&bch_fs_list_lock
);
198 static struct bch_fs
*__bch2_uuid_to_fs(__uuid_t uuid
)
202 lockdep_assert_held(&bch_fs_list_lock
);
204 list_for_each_entry(c
, &bch_fs_list
, list
)
205 if (!memcmp(&c
->disk_sb
.sb
->uuid
, &uuid
, sizeof(uuid
)))
211 struct bch_fs
*bch2_uuid_to_fs(__uuid_t uuid
)
215 mutex_lock(&bch_fs_list_lock
);
216 c
= __bch2_uuid_to_fs(uuid
);
219 mutex_unlock(&bch_fs_list_lock
);
224 static void bch2_dev_usage_journal_reserve(struct bch_fs
*c
)
226 unsigned nr
= 0, u64s
=
227 ((sizeof(struct jset_entry_dev_usage
) +
228 sizeof(struct jset_entry_dev_usage_type
) * BCH_DATA_NR
)) /
232 for_each_member_device_rcu(c
, ca
, NULL
)
236 bch2_journal_entry_res_resize(&c
->journal
,
237 &c
->dev_usage_journal_res
, u64s
* nr
);
/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
257 static void __bch2_fs_read_only(struct bch_fs
*c
)
259 unsigned clean_passes
= 0;
263 bch2_open_buckets_stop(c
, NULL
, true);
264 bch2_rebalance_stop(c
);
266 bch2_gc_thread_stop(c
);
269 bch_verbose(c
, "flushing journal and stopping allocators, journal seq %llu",
270 journal_cur_seq(&c
->journal
));
275 if (bch2_btree_interior_updates_flush(c
) ||
276 bch2_journal_flush_all_pins(&c
->journal
) ||
277 bch2_btree_flush_all_writes(c
) ||
278 seq
!= atomic64_read(&c
->journal
.seq
)) {
279 seq
= atomic64_read(&c
->journal
.seq
);
282 } while (clean_passes
< 2);
284 bch_verbose(c
, "flushing journal and stopping allocators complete, journal seq %llu",
285 journal_cur_seq(&c
->journal
));
287 if (test_bit(JOURNAL_REPLAY_DONE
, &c
->journal
.flags
) &&
288 !test_bit(BCH_FS_emergency_ro
, &c
->flags
))
289 set_bit(BCH_FS_clean_shutdown
, &c
->flags
);
290 bch2_fs_journal_stop(&c
->journal
);
293 * After stopping journal:
295 for_each_member_device(c
, ca
)
296 bch2_dev_allocator_remove(c
, ca
);
299 #ifndef BCH_WRITE_REF_DEBUG
300 static void bch2_writes_disabled(struct percpu_ref
*writes
)
302 struct bch_fs
*c
= container_of(writes
, struct bch_fs
, writes
);
304 set_bit(BCH_FS_write_disable_complete
, &c
->flags
);
305 wake_up(&bch2_read_only_wait
);
309 void bch2_fs_read_only(struct bch_fs
*c
)
311 if (!test_bit(BCH_FS_rw
, &c
->flags
)) {
312 bch2_journal_reclaim_stop(&c
->journal
);
316 BUG_ON(test_bit(BCH_FS_write_disable_complete
, &c
->flags
));
318 bch_verbose(c
, "going read-only");
321 * Block new foreground-end write operations from starting - any new
322 * writes will return -EROFS:
324 set_bit(BCH_FS_going_ro
, &c
->flags
);
325 #ifndef BCH_WRITE_REF_DEBUG
326 percpu_ref_kill(&c
->writes
);
328 for (unsigned i
= 0; i
< BCH_WRITE_REF_NR
; i
++)
329 bch2_write_ref_put(c
, i
);
333 * If we're not doing an emergency shutdown, we want to wait on
334 * outstanding writes to complete so they don't see spurious errors due
335 * to shutting down the allocator:
337 * If we are doing an emergency shutdown outstanding writes may
338 * hang until we shutdown the allocator so we don't want to wait
339 * on outstanding writes before shutting everything down - but
340 * we do need to wait on them before returning and signalling
341 * that going RO is complete:
343 wait_event(bch2_read_only_wait
,
344 test_bit(BCH_FS_write_disable_complete
, &c
->flags
) ||
345 test_bit(BCH_FS_emergency_ro
, &c
->flags
));
347 bool writes_disabled
= test_bit(BCH_FS_write_disable_complete
, &c
->flags
);
349 bch_verbose(c
, "finished waiting for writes to stop");
351 __bch2_fs_read_only(c
);
353 wait_event(bch2_read_only_wait
,
354 test_bit(BCH_FS_write_disable_complete
, &c
->flags
));
356 if (!writes_disabled
)
357 bch_verbose(c
, "finished waiting for writes to stop");
359 clear_bit(BCH_FS_write_disable_complete
, &c
->flags
);
360 clear_bit(BCH_FS_going_ro
, &c
->flags
);
361 clear_bit(BCH_FS_rw
, &c
->flags
);
363 if (!bch2_journal_error(&c
->journal
) &&
364 !test_bit(BCH_FS_error
, &c
->flags
) &&
365 !test_bit(BCH_FS_emergency_ro
, &c
->flags
) &&
366 test_bit(BCH_FS_started
, &c
->flags
) &&
367 test_bit(BCH_FS_clean_shutdown
, &c
->flags
) &&
368 !c
->opts
.norecovery
) {
369 BUG_ON(c
->journal
.last_empty_seq
!= journal_cur_seq(&c
->journal
));
370 BUG_ON(atomic_read(&c
->btree_cache
.dirty
));
371 BUG_ON(atomic_long_read(&c
->btree_key_cache
.nr_dirty
));
372 BUG_ON(c
->btree_write_buffer
.inc
.keys
.nr
);
373 BUG_ON(c
->btree_write_buffer
.flushing
.keys
.nr
);
375 bch_verbose(c
, "marking filesystem clean");
376 bch2_fs_mark_clean(c
);
378 bch_verbose(c
, "done going read-only, filesystem not clean");
382 static void bch2_fs_read_only_work(struct work_struct
*work
)
385 container_of(work
, struct bch_fs
, read_only_work
);
387 down_write(&c
->state_lock
);
388 bch2_fs_read_only(c
);
389 up_write(&c
->state_lock
);
392 static void bch2_fs_read_only_async(struct bch_fs
*c
)
394 queue_work(system_long_wq
, &c
->read_only_work
);
397 bool bch2_fs_emergency_read_only(struct bch_fs
*c
)
399 bool ret
= !test_and_set_bit(BCH_FS_emergency_ro
, &c
->flags
);
401 bch2_journal_halt(&c
->journal
);
402 bch2_fs_read_only_async(c
);
404 wake_up(&bch2_read_only_wait
);
/*
 * Late part of the read-write transition: start the background data-move
 * threads (copygc, rebalance). Returns 0 on success or a negative error.
 */
static int bch2_fs_read_write_late(struct bch_fs *c)
{
	int ret;

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 *
	 * Ideally we'd start copygc/rebalance earlier instead of waiting for
	 * all of recovery/fsck to complete:
	 */
	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err(c, "error starting copygc thread");
		return ret;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err(c, "error starting rebalance thread");
		return ret;
	}

	return 0;
}
434 static int __bch2_fs_read_write(struct bch_fs
*c
, bool early
)
438 if (test_bit(BCH_FS_initial_gc_unfixed
, &c
->flags
)) {
439 bch_err(c
, "cannot go rw, unfixed btree errors");
440 return -BCH_ERR_erofs_unfixed_errors
;
443 if (test_bit(BCH_FS_rw
, &c
->flags
))
446 bch_info(c
, "going read-write");
448 ret
= bch2_sb_members_v2_init(c
);
452 ret
= bch2_fs_mark_dirty(c
);
456 clear_bit(BCH_FS_clean_shutdown
, &c
->flags
);
459 * First journal write must be a flush write: after a clean shutdown we
460 * don't read the journal, so the first journal write may end up
461 * overwriting whatever was there previously, and there must always be
462 * at least one non-flush write in the journal or recovery will fail:
464 set_bit(JOURNAL_NEED_FLUSH_WRITE
, &c
->journal
.flags
);
466 for_each_rw_member(c
, ca
)
467 bch2_dev_allocator_add(c
, ca
);
468 bch2_recalc_capacity(c
);
470 set_bit(BCH_FS_rw
, &c
->flags
);
471 set_bit(BCH_FS_was_rw
, &c
->flags
);
473 #ifndef BCH_WRITE_REF_DEBUG
474 percpu_ref_reinit(&c
->writes
);
476 for (unsigned i
= 0; i
< BCH_WRITE_REF_NR
; i
++) {
477 BUG_ON(atomic_long_read(&c
->writes
[i
]));
478 atomic_long_inc(&c
->writes
[i
]);
482 ret
= bch2_gc_thread_start(c
);
484 bch_err(c
, "error starting gc thread");
488 ret
= bch2_journal_reclaim_start(&c
->journal
);
493 ret
= bch2_fs_read_write_late(c
);
499 bch2_do_invalidates(c
);
500 bch2_do_stripe_deletes(c
);
501 bch2_do_pending_node_rewrites(c
);
504 if (test_bit(BCH_FS_rw
, &c
->flags
))
505 bch2_fs_read_only(c
);
507 __bch2_fs_read_only(c
);
511 int bch2_fs_read_write(struct bch_fs
*c
)
513 if (c
->opts
.norecovery
)
514 return -BCH_ERR_erofs_norecovery
;
516 if (c
->opts
.nochanges
)
517 return -BCH_ERR_erofs_nochanges
;
519 return __bch2_fs_read_write(c
, false);
522 int bch2_fs_read_write_early(struct bch_fs
*c
)
524 lockdep_assert_held(&c
->state_lock
);
526 return __bch2_fs_read_write(c
, true);
529 /* Filesystem startup/shutdown: */
531 static void __bch2_fs_free(struct bch_fs
*c
)
535 for (i
= 0; i
< BCH_TIME_STAT_NR
; i
++)
536 bch2_time_stats_exit(&c
->times
[i
]);
538 bch2_free_pending_node_rewrites(c
);
539 bch2_fs_sb_errors_exit(c
);
540 bch2_fs_counters_exit(c
);
541 bch2_fs_snapshots_exit(c
);
542 bch2_fs_quota_exit(c
);
543 bch2_fs_fs_io_direct_exit(c
);
544 bch2_fs_fs_io_buffered_exit(c
);
545 bch2_fs_fsio_exit(c
);
547 bch2_fs_encryption_exit(c
);
548 bch2_fs_nocow_locking_exit(c
);
549 bch2_fs_io_write_exit(c
);
550 bch2_fs_io_read_exit(c
);
551 bch2_fs_buckets_waiting_for_journal_exit(c
);
552 bch2_fs_btree_interior_update_exit(c
);
553 bch2_fs_btree_iter_exit(c
);
554 bch2_fs_btree_key_cache_exit(&c
->btree_key_cache
);
555 bch2_fs_btree_cache_exit(c
);
556 bch2_fs_replicas_exit(c
);
557 bch2_fs_journal_exit(&c
->journal
);
558 bch2_io_clock_exit(&c
->io_clock
[WRITE
]);
559 bch2_io_clock_exit(&c
->io_clock
[READ
]);
560 bch2_fs_compress_exit(c
);
561 bch2_journal_keys_put_initial(c
);
562 BUG_ON(atomic_read(&c
->journal_keys
.ref
));
563 bch2_fs_btree_write_buffer_exit(c
);
564 percpu_free_rwsem(&c
->mark_lock
);
565 free_percpu(c
->online_reserved
);
567 darray_exit(&c
->btree_roots_extra
);
568 free_percpu(c
->pcpu
);
569 mempool_exit(&c
->large_bkey_pool
);
570 mempool_exit(&c
->btree_bounce_pool
);
571 bioset_exit(&c
->btree_bio
);
572 mempool_exit(&c
->fill_iter
);
573 #ifndef BCH_WRITE_REF_DEBUG
574 percpu_ref_exit(&c
->writes
);
576 kfree(rcu_dereference_protected(c
->disk_groups
, 1));
577 kfree(c
->journal_seq_blacklist_table
);
578 kfree(c
->unused_inode_hints
);
581 destroy_workqueue(c
->write_ref_wq
);
582 if (c
->io_complete_wq
)
583 destroy_workqueue(c
->io_complete_wq
);
585 destroy_workqueue(c
->copygc_wq
);
586 if (c
->btree_io_complete_wq
)
587 destroy_workqueue(c
->btree_io_complete_wq
);
588 if (c
->btree_update_wq
)
589 destroy_workqueue(c
->btree_update_wq
);
591 bch2_free_super(&c
->disk_sb
);
593 module_put(THIS_MODULE
);
596 static void bch2_fs_release(struct kobject
*kobj
)
598 struct bch_fs
*c
= container_of(kobj
, struct bch_fs
, kobj
);
603 void __bch2_fs_stop(struct bch_fs
*c
)
605 bch_verbose(c
, "shutting down");
607 set_bit(BCH_FS_stopping
, &c
->flags
);
609 cancel_work_sync(&c
->journal_seq_blacklist_gc_work
);
611 down_write(&c
->state_lock
);
612 bch2_fs_read_only(c
);
613 up_write(&c
->state_lock
);
615 for_each_member_device(c
, ca
)
616 if (ca
->kobj
.state_in_sysfs
&&
618 sysfs_remove_link(bdev_kobj(ca
->disk_sb
.bdev
), "bcachefs");
620 if (c
->kobj
.state_in_sysfs
)
621 kobject_del(&c
->kobj
);
623 bch2_fs_debug_exit(c
);
624 bch2_fs_chardev_exit(c
);
627 wait_event(c
->ro_ref_wait
, !refcount_read(&c
->ro_ref
));
629 kobject_put(&c
->counters_kobj
);
630 kobject_put(&c
->time_stats
);
631 kobject_put(&c
->opts_dir
);
632 kobject_put(&c
->internal
);
634 /* btree prefetch might have kicked off reads in the background: */
635 bch2_btree_flush_all_reads(c
);
637 for_each_member_device(c
, ca
)
638 cancel_work_sync(&ca
->io_error_work
);
640 cancel_work_sync(&c
->read_only_work
);
643 void bch2_fs_free(struct bch_fs
*c
)
647 mutex_lock(&bch_fs_list_lock
);
649 mutex_unlock(&bch_fs_list_lock
);
651 closure_sync(&c
->cl
);
652 closure_debug_destroy(&c
->cl
);
654 for (i
= 0; i
< c
->sb
.nr_devices
; i
++) {
655 struct bch_dev
*ca
= rcu_dereference_protected(c
->devs
[i
], true);
658 bch2_free_super(&ca
->disk_sb
);
663 bch_verbose(c
, "shutdown complete");
665 kobject_put(&c
->kobj
);
668 void bch2_fs_stop(struct bch_fs
*c
)
674 static int bch2_fs_online(struct bch_fs
*c
)
678 lockdep_assert_held(&bch_fs_list_lock
);
680 if (__bch2_uuid_to_fs(c
->sb
.uuid
)) {
681 bch_err(c
, "filesystem UUID already open");
685 ret
= bch2_fs_chardev_init(c
);
687 bch_err(c
, "error creating character device");
691 bch2_fs_debug_init(c
);
693 ret
= kobject_add(&c
->kobj
, NULL
, "%pU", c
->sb
.user_uuid
.b
) ?:
694 kobject_add(&c
->internal
, &c
->kobj
, "internal") ?:
695 kobject_add(&c
->opts_dir
, &c
->kobj
, "options") ?:
696 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
697 kobject_add(&c
->time_stats
, &c
->kobj
, "time_stats") ?:
699 kobject_add(&c
->counters_kobj
, &c
->kobj
, "counters") ?:
700 bch2_opts_create_sysfs_files(&c
->opts_dir
);
702 bch_err(c
, "error creating sysfs objects");
706 down_write(&c
->state_lock
);
708 for_each_member_device(c
, ca
) {
709 ret
= bch2_dev_sysfs_online(c
, ca
);
711 bch_err(c
, "error creating sysfs objects");
712 percpu_ref_put(&ca
->ref
);
717 BUG_ON(!list_empty(&c
->list
));
718 list_add(&c
->list
, &bch_fs_list
);
720 up_write(&c
->state_lock
);
724 static struct bch_fs
*bch2_fs_alloc(struct bch_sb
*sb
, struct bch_opts opts
)
727 struct printbuf name
= PRINTBUF
;
728 unsigned i
, iter_size
;
731 c
= kvmalloc(sizeof(struct bch_fs
), GFP_KERNEL
|__GFP_ZERO
);
733 c
= ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc
);
737 c
->stdio
= (void *)(unsigned long) opts
.stdio
;
739 __module_get(THIS_MODULE
);
741 closure_init(&c
->cl
, NULL
);
743 c
->kobj
.kset
= bcachefs_kset
;
744 kobject_init(&c
->kobj
, &bch2_fs_ktype
);
745 kobject_init(&c
->internal
, &bch2_fs_internal_ktype
);
746 kobject_init(&c
->opts_dir
, &bch2_fs_opts_dir_ktype
);
747 kobject_init(&c
->time_stats
, &bch2_fs_time_stats_ktype
);
748 kobject_init(&c
->counters_kobj
, &bch2_fs_counters_ktype
);
751 c
->disk_sb
.fs_sb
= true;
753 init_rwsem(&c
->state_lock
);
754 mutex_init(&c
->sb_lock
);
755 mutex_init(&c
->replicas_gc_lock
);
756 mutex_init(&c
->btree_root_lock
);
757 INIT_WORK(&c
->read_only_work
, bch2_fs_read_only_work
);
759 refcount_set(&c
->ro_ref
, 1);
760 init_waitqueue_head(&c
->ro_ref_wait
);
761 sema_init(&c
->online_fsck_mutex
, 1);
763 init_rwsem(&c
->gc_lock
);
764 mutex_init(&c
->gc_gens_lock
);
765 atomic_set(&c
->journal_keys
.ref
, 1);
766 c
->journal_keys
.initial_ref_held
= true;
768 for (i
= 0; i
< BCH_TIME_STAT_NR
; i
++)
769 bch2_time_stats_init(&c
->times
[i
]);
771 bch2_fs_copygc_init(c
);
772 bch2_fs_btree_key_cache_init_early(&c
->btree_key_cache
);
773 bch2_fs_btree_iter_init_early(c
);
774 bch2_fs_btree_interior_update_init_early(c
);
775 bch2_fs_allocator_background_init(c
);
776 bch2_fs_allocator_foreground_init(c
);
777 bch2_fs_rebalance_init(c
);
778 bch2_fs_quota_init(c
);
779 bch2_fs_ec_init_early(c
);
780 bch2_fs_move_init(c
);
781 bch2_fs_sb_errors_init_early(c
);
783 INIT_LIST_HEAD(&c
->list
);
785 mutex_init(&c
->usage_scratch_lock
);
787 mutex_init(&c
->bio_bounce_pages_lock
);
788 mutex_init(&c
->snapshot_table_lock
);
789 init_rwsem(&c
->snapshot_create_lock
);
791 spin_lock_init(&c
->btree_write_error_lock
);
793 INIT_WORK(&c
->journal_seq_blacklist_gc_work
,
794 bch2_blacklist_entries_gc
);
796 INIT_LIST_HEAD(&c
->journal_iters
);
798 INIT_LIST_HEAD(&c
->fsck_error_msgs
);
799 mutex_init(&c
->fsck_error_msgs_lock
);
801 seqcount_init(&c
->gc_pos_lock
);
803 seqcount_init(&c
->usage_lock
);
805 sema_init(&c
->io_in_flight
, 128);
807 INIT_LIST_HEAD(&c
->vfs_inodes_list
);
808 mutex_init(&c
->vfs_inodes_lock
);
810 c
->copy_gc_enabled
= 1;
811 c
->rebalance
.enabled
= 1;
812 c
->promote_whole_extents
= true;
814 c
->journal
.flush_write_time
= &c
->times
[BCH_TIME_journal_flush_write
];
815 c
->journal
.noflush_write_time
= &c
->times
[BCH_TIME_journal_noflush_write
];
816 c
->journal
.flush_seq_time
= &c
->times
[BCH_TIME_journal_flush_seq
];
818 bch2_fs_btree_cache_init_early(&c
->btree_cache
);
820 mutex_init(&c
->sectors_available_lock
);
822 ret
= percpu_init_rwsem(&c
->mark_lock
);
826 mutex_lock(&c
->sb_lock
);
827 ret
= bch2_sb_to_fs(c
, sb
);
828 mutex_unlock(&c
->sb_lock
);
833 pr_uuid(&name
, c
->sb
.user_uuid
.b
);
834 ret
= name
.allocation_failure
? -BCH_ERR_ENOMEM_fs_name_alloc
: 0;
838 strscpy(c
->name
, name
.buf
, sizeof(c
->name
));
839 printbuf_exit(&name
);
842 if (le16_to_cpu(sb
->version
) <= bcachefs_metadata_version_inode_v2
&&
843 !BCH_SB_JOURNAL_FLUSH_DELAY(sb
))
844 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb
, 1000);
846 if (le16_to_cpu(sb
->version
) <= bcachefs_metadata_version_inode_v2
&&
847 !BCH_SB_JOURNAL_RECLAIM_DELAY(sb
))
848 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb
, 100);
850 c
->opts
= bch2_opts_default
;
851 ret
= bch2_opts_from_sb(&c
->opts
, sb
);
855 bch2_opts_apply(&c
->opts
, opts
);
857 c
->btree_key_cache_btrees
|= 1U << BTREE_ID_alloc
;
858 if (c
->opts
.inodes_use_key_cache
)
859 c
->btree_key_cache_btrees
|= 1U << BTREE_ID_inodes
;
860 c
->btree_key_cache_btrees
|= 1U << BTREE_ID_logged_ops
;
862 c
->block_bits
= ilog2(block_sectors(c
));
863 c
->btree_foreground_merge_threshold
= BTREE_FOREGROUND_MERGE_THRESHOLD(c
);
865 if (bch2_fs_init_fault("fs_alloc")) {
866 bch_err(c
, "fs_alloc fault injected");
871 iter_size
= sizeof(struct sort_iter
) +
872 (btree_blocks(c
) + 1) * 2 *
873 sizeof(struct sort_iter_set
);
875 c
->inode_shard_bits
= ilog2(roundup_pow_of_two(num_possible_cpus()));
877 if (!(c
->btree_update_wq
= alloc_workqueue("bcachefs",
878 WQ_HIGHPRI
|WQ_FREEZABLE
|WQ_MEM_RECLAIM
|WQ_UNBOUND
, 512)) ||
879 !(c
->btree_io_complete_wq
= alloc_workqueue("bcachefs_btree_io",
880 WQ_HIGHPRI
|WQ_FREEZABLE
|WQ_MEM_RECLAIM
, 1)) ||
881 !(c
->copygc_wq
= alloc_workqueue("bcachefs_copygc",
882 WQ_HIGHPRI
|WQ_FREEZABLE
|WQ_MEM_RECLAIM
|WQ_CPU_INTENSIVE
, 1)) ||
883 !(c
->io_complete_wq
= alloc_workqueue("bcachefs_io",
884 WQ_HIGHPRI
|WQ_FREEZABLE
|WQ_MEM_RECLAIM
, 512)) ||
885 !(c
->write_ref_wq
= alloc_workqueue("bcachefs_write_ref",
887 #ifndef BCH_WRITE_REF_DEBUG
888 percpu_ref_init(&c
->writes
, bch2_writes_disabled
,
889 PERCPU_REF_INIT_DEAD
, GFP_KERNEL
) ||
891 mempool_init_kmalloc_pool(&c
->fill_iter
, 1, iter_size
) ||
892 bioset_init(&c
->btree_bio
, 1,
893 max(offsetof(struct btree_read_bio
, bio
),
894 offsetof(struct btree_write_bio
, wbio
.bio
)),
895 BIOSET_NEED_BVECS
) ||
896 !(c
->pcpu
= alloc_percpu(struct bch_fs_pcpu
)) ||
897 !(c
->online_reserved
= alloc_percpu(u64
)) ||
898 mempool_init_kvmalloc_pool(&c
->btree_bounce_pool
, 1,
899 c
->opts
.btree_node_size
) ||
900 mempool_init_kmalloc_pool(&c
->large_bkey_pool
, 1, 2048) ||
901 !(c
->unused_inode_hints
= kcalloc(1U << c
->inode_shard_bits
,
902 sizeof(u64
), GFP_KERNEL
))) {
903 ret
= -BCH_ERR_ENOMEM_fs_other_alloc
;
907 ret
= bch2_fs_counters_init(c
) ?:
908 bch2_fs_sb_errors_init(c
) ?:
909 bch2_io_clock_init(&c
->io_clock
[READ
]) ?:
910 bch2_io_clock_init(&c
->io_clock
[WRITE
]) ?:
911 bch2_fs_journal_init(&c
->journal
) ?:
912 bch2_fs_replicas_init(c
) ?:
913 bch2_fs_btree_cache_init(c
) ?:
914 bch2_fs_btree_key_cache_init(&c
->btree_key_cache
) ?:
915 bch2_fs_btree_iter_init(c
) ?:
916 bch2_fs_btree_interior_update_init(c
) ?:
917 bch2_fs_buckets_waiting_for_journal_init(c
) ?:
918 bch2_fs_btree_write_buffer_init(c
) ?:
919 bch2_fs_subvolumes_init(c
) ?:
920 bch2_fs_io_read_init(c
) ?:
921 bch2_fs_io_write_init(c
) ?:
922 bch2_fs_nocow_locking_init(c
) ?:
923 bch2_fs_encryption_init(c
) ?:
924 bch2_fs_compress_init(c
) ?:
925 bch2_fs_ec_init(c
) ?:
926 bch2_fs_fsio_init(c
) ?:
927 bch2_fs_fs_io_buffered_init(c
) ?:
928 bch2_fs_fs_io_direct_init(c
);
932 for (i
= 0; i
< c
->sb
.nr_devices
; i
++)
933 if (bch2_dev_exists(c
->disk_sb
.sb
, i
) &&
934 bch2_dev_alloc(c
, i
)) {
939 bch2_journal_entry_res_resize(&c
->journal
,
940 &c
->btree_root_journal_res
,
941 BTREE_ID_NR
* (JSET_KEYS_U64s
+ BKEY_BTREE_PTR_U64s_MAX
));
942 bch2_dev_usage_journal_reserve(c
);
943 bch2_journal_entry_res_resize(&c
->journal
,
944 &c
->clock_journal_res
,
945 (sizeof(struct jset_entry_clock
) / sizeof(u64
)) * 2);
947 mutex_lock(&bch_fs_list_lock
);
948 ret
= bch2_fs_online(c
);
949 mutex_unlock(&bch_fs_list_lock
);
962 static void print_mount_opts(struct bch_fs
*c
)
965 struct printbuf p
= PRINTBUF
;
968 prt_str(&p
, "mounting version ");
969 bch2_version_to_text(&p
, c
->sb
.version
);
971 if (c
->opts
.read_only
) {
972 prt_str(&p
, " opts=");
974 prt_printf(&p
, "ro");
977 for (i
= 0; i
< bch2_opts_nr
; i
++) {
978 const struct bch_option
*opt
= &bch2_opt_table
[i
];
979 u64 v
= bch2_opt_get_by_id(&c
->opts
, i
);
981 if (!(opt
->flags
& OPT_MOUNT
))
984 if (v
== bch2_opt_get_by_id(&bch2_opts_default
, i
))
987 prt_str(&p
, first
? " opts=" : ",");
989 bch2_opt_to_text(&p
, c
, c
->disk_sb
.sb
, opt
, v
, OPT_SHOW_MOUNT_STYLE
);
992 bch_info(c
, "%s", p
.buf
);
996 int bch2_fs_start(struct bch_fs
*c
)
998 time64_t now
= ktime_get_real_seconds();
1001 print_mount_opts(c
);
1003 down_write(&c
->state_lock
);
1005 BUG_ON(test_bit(BCH_FS_started
, &c
->flags
));
1007 mutex_lock(&c
->sb_lock
);
1009 ret
= bch2_sb_members_v2_init(c
);
1011 mutex_unlock(&c
->sb_lock
);
1015 for_each_online_member(c
, ca
)
1016 bch2_members_v2_get_mut(c
->disk_sb
.sb
, ca
->dev_idx
)->last_mount
= cpu_to_le64(now
);
1018 mutex_unlock(&c
->sb_lock
);
1020 for_each_rw_member(c
, ca
)
1021 bch2_dev_allocator_add(c
, ca
);
1022 bch2_recalc_capacity(c
);
1024 ret
= BCH_SB_INITIALIZED(c
->disk_sb
.sb
)
1025 ? bch2_fs_recovery(c
)
1026 : bch2_fs_initialize(c
);
1030 ret
= bch2_opts_check_may_set(c
);
1034 if (bch2_fs_init_fault("fs_start")) {
1035 bch_err(c
, "fs_start fault injected");
1040 set_bit(BCH_FS_started
, &c
->flags
);
1042 if (c
->opts
.read_only
) {
1043 bch2_fs_read_only(c
);
1045 ret
= !test_bit(BCH_FS_rw
, &c
->flags
)
1046 ? bch2_fs_read_write(c
)
1047 : bch2_fs_read_write_late(c
);
1055 bch_err_msg(c
, ret
, "starting filesystem");
1057 bch_verbose(c
, "done starting filesystem");
1058 up_write(&c
->state_lock
);
1062 static int bch2_dev_may_add(struct bch_sb
*sb
, struct bch_fs
*c
)
1064 struct bch_member m
= bch2_sb_member_get(sb
, sb
->dev_idx
);
1066 if (le16_to_cpu(sb
->block_size
) != block_sectors(c
))
1067 return -BCH_ERR_mismatched_block_size
;
1069 if (le16_to_cpu(m
.bucket_size
) <
1070 BCH_SB_BTREE_NODE_SIZE(c
->disk_sb
.sb
))
1071 return -BCH_ERR_bucket_size_too_small
;
1076 static int bch2_dev_in_fs(struct bch_sb_handle
*fs
,
1077 struct bch_sb_handle
*sb
,
1078 struct bch_opts
*opts
)
1083 if (!uuid_equal(&fs
->sb
->uuid
, &sb
->sb
->uuid
))
1084 return -BCH_ERR_device_not_a_member_of_filesystem
;
1086 if (!bch2_dev_exists(fs
->sb
, sb
->sb
->dev_idx
))
1087 return -BCH_ERR_device_has_been_removed
;
1089 if (fs
->sb
->block_size
!= sb
->sb
->block_size
)
1090 return -BCH_ERR_mismatched_block_size
;
1092 if (le16_to_cpu(fs
->sb
->version
) < bcachefs_metadata_version_member_seq
||
1093 le16_to_cpu(sb
->sb
->version
) < bcachefs_metadata_version_member_seq
)
1096 if (fs
->sb
->seq
== sb
->sb
->seq
&&
1097 fs
->sb
->write_time
!= sb
->sb
->write_time
) {
1098 struct printbuf buf
= PRINTBUF
;
1100 prt_str(&buf
, "Split brain detected between ");
1101 prt_bdevname(&buf
, sb
->bdev
);
1102 prt_str(&buf
, " and ");
1103 prt_bdevname(&buf
, fs
->bdev
);
1104 prt_char(&buf
, ':');
1106 prt_printf(&buf
, "seq=%llu but write_time different, got", le64_to_cpu(sb
->sb
->seq
));
1109 prt_bdevname(&buf
, fs
->bdev
);
1110 prt_char(&buf
, ' ');
1111 bch2_prt_datetime(&buf
, le64_to_cpu(fs
->sb
->write_time
));;
1114 prt_bdevname(&buf
, sb
->bdev
);
1115 prt_char(&buf
, ' ');
1116 bch2_prt_datetime(&buf
, le64_to_cpu(sb
->sb
->write_time
));;
1119 if (!opts
->no_splitbrain_check
)
1120 prt_printf(&buf
, "Not using older sb");
1122 pr_err("%s", buf
.buf
);
1123 printbuf_exit(&buf
);
1125 if (!opts
->no_splitbrain_check
)
1126 return -BCH_ERR_device_splitbrain
;
1129 struct bch_member m
= bch2_sb_member_get(fs
->sb
, sb
->sb
->dev_idx
);
1130 u64 seq_from_fs
= le64_to_cpu(m
.seq
);
1131 u64 seq_from_member
= le64_to_cpu(sb
->sb
->seq
);
1133 if (seq_from_fs
&& seq_from_fs
< seq_from_member
) {
1134 struct printbuf buf
= PRINTBUF
;
1136 prt_str(&buf
, "Split brain detected between ");
1137 prt_bdevname(&buf
, sb
->bdev
);
1138 prt_str(&buf
, " and ");
1139 prt_bdevname(&buf
, fs
->bdev
);
1140 prt_char(&buf
, ':');
1143 prt_bdevname(&buf
, fs
->bdev
);
1144 prt_str(&buf
, " believes seq of ");
1145 prt_bdevname(&buf
, sb
->bdev
);
1146 prt_printf(&buf
, " to be %llu, but ", seq_from_fs
);
1147 prt_bdevname(&buf
, sb
->bdev
);
1148 prt_printf(&buf
, " has %llu\n", seq_from_member
);
1150 if (!opts
->no_splitbrain_check
) {
1151 prt_str(&buf
, "Not using ");
1152 prt_bdevname(&buf
, sb
->bdev
);
1155 pr_err("%s", buf
.buf
);
1156 printbuf_exit(&buf
);
1158 if (!opts
->no_splitbrain_check
)
1159 return -BCH_ERR_device_splitbrain
;
1165 /* Device startup/shutdown: */
1167 static void bch2_dev_release(struct kobject
*kobj
)
1169 struct bch_dev
*ca
= container_of(kobj
, struct bch_dev
, kobj
);
1174 static void bch2_dev_free(struct bch_dev
*ca
)
1176 cancel_work_sync(&ca
->io_error_work
);
1178 if (ca
->kobj
.state_in_sysfs
&&
1180 sysfs_remove_link(bdev_kobj(ca
->disk_sb
.bdev
), "bcachefs");
1182 if (ca
->kobj
.state_in_sysfs
)
1183 kobject_del(&ca
->kobj
);
1185 bch2_free_super(&ca
->disk_sb
);
1186 bch2_dev_journal_exit(ca
);
1188 free_percpu(ca
->io_done
);
1189 bioset_exit(&ca
->replica_set
);
1190 bch2_dev_buckets_free(ca
);
1191 free_page((unsigned long) ca
->sb_read_scratch
);
1193 bch2_time_stats_quantiles_exit(&ca
->io_latency
[WRITE
]);
1194 bch2_time_stats_quantiles_exit(&ca
->io_latency
[READ
]);
1196 percpu_ref_exit(&ca
->io_ref
);
1197 percpu_ref_exit(&ca
->ref
);
1198 kobject_put(&ca
->kobj
);
1201 static void __bch2_dev_offline(struct bch_fs
*c
, struct bch_dev
*ca
)
1204 lockdep_assert_held(&c
->state_lock
);
1206 if (percpu_ref_is_zero(&ca
->io_ref
))
1209 __bch2_dev_read_only(c
, ca
);
1211 reinit_completion(&ca
->io_ref_completion
);
1212 percpu_ref_kill(&ca
->io_ref
);
1213 wait_for_completion(&ca
->io_ref_completion
);
1215 if (ca
->kobj
.state_in_sysfs
) {
1216 sysfs_remove_link(bdev_kobj(ca
->disk_sb
.bdev
), "bcachefs");
1217 sysfs_remove_link(&ca
->kobj
, "block");
1220 bch2_free_super(&ca
->disk_sb
);
1221 bch2_dev_journal_exit(ca
);
1224 static void bch2_dev_ref_complete(struct percpu_ref
*ref
)
1226 struct bch_dev
*ca
= container_of(ref
, struct bch_dev
, ref
);
1228 complete(&ca
->ref_completion
);
1231 static void bch2_dev_io_ref_complete(struct percpu_ref
*ref
)
1233 struct bch_dev
*ca
= container_of(ref
, struct bch_dev
, io_ref
);
1235 complete(&ca
->io_ref_completion
);
1238 static int bch2_dev_sysfs_online(struct bch_fs
*c
, struct bch_dev
*ca
)
1242 if (!c
->kobj
.state_in_sysfs
)
1245 if (!ca
->kobj
.state_in_sysfs
) {
1246 ret
= kobject_add(&ca
->kobj
, &c
->kobj
,
1247 "dev-%u", ca
->dev_idx
);
1252 if (ca
->disk_sb
.bdev
) {
1253 struct kobject
*block
= bdev_kobj(ca
->disk_sb
.bdev
);
1255 ret
= sysfs_create_link(block
, &ca
->kobj
, "bcachefs");
1259 ret
= sysfs_create_link(&ca
->kobj
, block
, "block");
1267 static struct bch_dev
*__bch2_dev_alloc(struct bch_fs
*c
,
1268 struct bch_member
*member
)
1273 ca
= kzalloc(sizeof(*ca
), GFP_KERNEL
);
1277 kobject_init(&ca
->kobj
, &bch2_dev_ktype
);
1278 init_completion(&ca
->ref_completion
);
1279 init_completion(&ca
->io_ref_completion
);
1281 init_rwsem(&ca
->bucket_lock
);
1283 INIT_WORK(&ca
->io_error_work
, bch2_io_error_work
);
1285 bch2_time_stats_quantiles_init(&ca
->io_latency
[READ
]);
1286 bch2_time_stats_quantiles_init(&ca
->io_latency
[WRITE
]);
1288 ca
->mi
= bch2_mi_to_cpu(member
);
1290 for (i
= 0; i
< ARRAY_SIZE(member
->errors
); i
++)
1291 atomic64_set(&ca
->errors
[i
], le64_to_cpu(member
->errors
[i
]));
1293 ca
->uuid
= member
->uuid
;
1295 ca
->nr_btree_reserve
= DIV_ROUND_UP(BTREE_NODE_RESERVE
,
1296 ca
->mi
.bucket_size
/ btree_sectors(c
));
1298 if (percpu_ref_init(&ca
->ref
, bch2_dev_ref_complete
,
1300 percpu_ref_init(&ca
->io_ref
, bch2_dev_io_ref_complete
,
1301 PERCPU_REF_INIT_DEAD
, GFP_KERNEL
) ||
1302 !(ca
->sb_read_scratch
= (void *) __get_free_page(GFP_KERNEL
)) ||
1303 bch2_dev_buckets_alloc(c
, ca
) ||
1304 bioset_init(&ca
->replica_set
, 4,
1305 offsetof(struct bch_write_bio
, bio
), 0) ||
1306 !(ca
->io_done
= alloc_percpu(*ca
->io_done
)))
1315 static void bch2_dev_attach(struct bch_fs
*c
, struct bch_dev
*ca
,
1318 ca
->dev_idx
= dev_idx
;
1319 __set_bit(ca
->dev_idx
, ca
->self
.d
);
1320 scnprintf(ca
->name
, sizeof(ca
->name
), "dev-%u", dev_idx
);
1323 rcu_assign_pointer(c
->devs
[ca
->dev_idx
], ca
);
1325 if (bch2_dev_sysfs_online(c
, ca
))
1326 pr_warn("error creating sysfs objects");
/*
 * bch2_dev_alloc(): allocate the in-core device object for superblock
 * member slot dev_idx and attach it to @c.
 *
 * NOTE(review): the error-path lines (original 1336-1343, 1345-1348)
 * are missing from this extraction; the visible tail returns
 * -BCH_ERR_ENOMEM_dev_alloc on the failure path.
 */
1329 static int bch2_dev_alloc(struct bch_fs
*c
, unsigned dev_idx
)
1331 struct bch_member member
= bch2_sb_member_get(c
->disk_sb
.sb
, dev_idx
);
1332 struct bch_dev
*ca
= NULL
;
/* fault-injection point for exercising the allocation failure path */
1335 if (bch2_fs_init_fault("dev_alloc"))
1338 ca
= __bch2_dev_alloc(c
, &member
);
1344 bch2_dev_attach(c
, ca
, dev_idx
);
1349 return -BCH_ERR_ENOMEM_dev_alloc
;
/*
 * __bch2_dev_attach_bdev(): attach an opened block device (superblock
 * handle) to an in-core device: validate capacity, initialize the
 * device's journal, take ownership of the sb handle, then bring io_ref
 * back up so IO can proceed.
 *
 * NOTE(review): several original lines are missing here (e.g. 1353-1355,
 * 1358, 1371-1375, 1381-1383) -- verify against upstream super.c.
 */
1352 static int __bch2_dev_attach_bdev(struct bch_dev
*ca
, struct bch_sb_handle
*sb
)
/* refuse to attach a bdev to a slot that is already online */
1356 if (bch2_dev_is_online(ca
)) {
1357 bch_err(ca
, "already have device online in slot %u",
1359 return -BCH_ERR_device_already_online
;
/* device must be big enough for the member's bucket layout */
1362 if (get_capacity(sb
->bdev
->bd_disk
) <
1363 ca
->mi
.bucket_size
* ca
->mi
.nbuckets
) {
1364 bch_err(ca
, "cannot online: device too small");
1365 return -BCH_ERR_device_size_too_small
;
/* io_ref must still be dead (it was created PERCPU_REF_INIT_DEAD) */
1368 BUG_ON(!percpu_ref_is_zero(&ca
->io_ref
));
1370 ret
= bch2_dev_journal_init(ca
, sb
->sb
);
/* ownership of the sb handle moves to ca; clear the caller's copy */
1376 memset(sb
, 0, sizeof(*sb
));
1378 ca
->dev
= ca
->disk_sb
.bdev
->bd_dev
;
/* device is now usable for IO */
1380 percpu_ref_reinit(&ca
->io_ref
);
/*
 * bch2_dev_attach_bdev(): filesystem-level wrapper around
 * __bch2_dev_attach_bdev(): adopt the newer superblock if @sb is newer
 * than ours, attach the bdev, then refresh the device (and possibly
 * filesystem) name from the block device.
 *
 * NOTE(review): braces/error-handling lines are missing from this
 * extraction (e.g. originals 1386-1389, 1402-1404) -- verify upstream.
 */
1385 static int bch2_dev_attach_bdev(struct bch_fs
*c
, struct bch_sb_handle
*sb
)
1390 lockdep_assert_held(&c
->state_lock
);
/* if the incoming member sb is newer than ours, take its fields */
1392 if (le64_to_cpu(sb
->sb
->seq
) >
1393 le64_to_cpu(c
->disk_sb
.sb
->seq
))
1394 bch2_sb_to_fs(c
, sb
->sb
);
1396 BUG_ON(sb
->sb
->dev_idx
>= c
->sb
.nr_devices
||
1397 !c
->devs
[sb
->sb
->dev_idx
]);
1399 ca
= bch_dev_locked(c
, sb
->sb
->dev_idx
);
1401 ret
= __bch2_dev_attach_bdev(ca
, sb
);
1405 bch2_dev_sysfs_online(c
, ca
);
/* name the device after its bdev; single-device fs takes the same name */
1407 struct printbuf name
= PRINTBUF
;
1408 prt_bdevname(&name
, ca
->disk_sb
.bdev
);
1410 if (c
->sb
.nr_devices
== 1)
1411 strscpy(c
->name
, name
.buf
, sizeof(c
->name
));
1412 strscpy(ca
->name
, name
.buf
, sizeof(ca
->name
));
1414 printbuf_exit(&name
);
/* a newly attached device may create rebalance work */
1416 rebalance_wakeup(c
);
1420 /* Device management: */
1423 * Note: this function is also used by the error paths - when a particular
1424 * device sees an error, we call it to determine whether we can just set the
1425 * device RO, or - if this function returns false - we'll set the whole
1428 * XXX: maybe we should be more explicit about whether we're changing state
1429 * because we got an error or what have you?
/*
 * bch2_dev_state_allowed(): can @ca move to @new_state without losing
 * access to data?  @flags (BCH_FORCE_IF_*) relaxes the replica
 * requirements.  Called with state_lock held.
 *
 * NOTE(review): braces and a few `return true`/`break` lines are
 * missing from this extraction (e.g. originals 1441, 1444, 1463) --
 * verify against upstream super.c.
 */
1431 bool bch2_dev_state_allowed(struct bch_fs
*c
, struct bch_dev
*ca
,
1432 enum bch_member_state new_state
, int flags
)
1434 struct bch_devs_mask new_online_devs
;
1435 int nr_rw
= 0, required
;
1437 lockdep_assert_held(&c
->state_lock
);
1439 switch (new_state
) {
1440 case BCH_MEMBER_STATE_rw
:
1442 case BCH_MEMBER_STATE_ro
:
/* only transitions away from rw can reduce write capacity */
1443 if (ca
->mi
.state
!= BCH_MEMBER_STATE_rw
)
1446 /* do we have enough devices to write to? */
1447 for_each_member_device(c
, ca2
)
1449 nr_rw
+= ca2
->mi
.state
== BCH_MEMBER_STATE_rw
;
/* force flags drop the requirement to the hard minimum */
1451 required
= max(!(flags
& BCH_FORCE_IF_METADATA_DEGRADED
)
1452 ? c
->opts
.metadata_replicas
1453 : metadata_replicas_required(c
),
1454 !(flags
& BCH_FORCE_IF_DATA_DEGRADED
)
1455 ? c
->opts
.data_replicas
1456 : data_replicas_required(c
));
1458 return nr_rw
>= required
;
1459 case BCH_MEMBER_STATE_failed
:
1460 case BCH_MEMBER_STATE_spare
:
1461 if (ca
->mi
.state
!= BCH_MEMBER_STATE_rw
&&
1462 ca
->mi
.state
!= BCH_MEMBER_STATE_ro
)
1465 /* do we have enough devices to read from? */
1466 new_online_devs
= bch2_online_devs(c
);
1467 __clear_bit(ca
->dev_idx
, new_online_devs
.d
);
1469 return bch2_have_enough_devs(c
, new_online_devs
, flags
, false);
/*
 * bch2_fs_may_start(): decide whether the filesystem can start with the
 * devices currently online, honoring the degraded/very_degraded mount
 * options.  When neither option is set, any rw/ro member that is not
 * online vetoes startup.
 *
 * NOTE(review): braces and a `return false` inside the loop (original
 * ~1500) are missing from this extraction -- verify upstream.
 */
1475 static bool bch2_fs_may_start(struct bch_fs
*c
)
1478 unsigned i
, flags
= 0;
1480 if (c
->opts
.very_degraded
)
1481 flags
|= BCH_FORCE_IF_DEGRADED
|BCH_FORCE_IF_LOST
;
1483 if (c
->opts
.degraded
)
1484 flags
|= BCH_FORCE_IF_DEGRADED
;
/* strict mode: every rw/ro member must be present */
1486 if (!c
->opts
.degraded
&&
1487 !c
->opts
.very_degraded
) {
1488 mutex_lock(&c
->sb_lock
);
1490 for (i
= 0; i
< c
->disk_sb
.sb
->nr_devices
; i
++) {
1491 if (!bch2_dev_exists(c
->disk_sb
.sb
, i
))
1494 ca
= bch_dev_locked(c
, i
);
1496 if (!bch2_dev_is_online(ca
) &&
1497 (ca
->mi
.state
== BCH_MEMBER_STATE_rw
||
1498 ca
->mi
.state
== BCH_MEMBER_STATE_ro
)) {
1499 mutex_unlock(&c
->sb_lock
);
1503 mutex_unlock(&c
->sb_lock
);
1506 return bch2_have_enough_devs(c
, bch2_online_devs(c
), flags
, true);
/*
 * __bch2_dev_read_only(): take a device out of write service: remove it
 * from the allocator, then stop its journal.
 */
1509 static void __bch2_dev_read_only(struct bch_fs
*c
, struct bch_dev
*ca
)
1512 * The allocator thread itself allocates btree nodes, so stop it first:
1514 bch2_dev_allocator_remove(c
, ca
);
1515 bch2_dev_journal_stop(&c
->journal
, ca
);
/*
 * __bch2_dev_read_write(): put a device (whose member state must already
 * be rw) back into write service and recompute fs capacity.  Called with
 * state_lock held.
 */
1518 static void __bch2_dev_read_write(struct bch_fs
*c
, struct bch_dev
*ca
)
1520 lockdep_assert_held(&c
->state_lock
);
1522 BUG_ON(ca
->mi
.state
!= BCH_MEMBER_STATE_rw
);
1524 bch2_dev_allocator_add(c
, ca
);
1525 bch2_recalc_capacity(c
);
/*
 * __bch2_dev_set_state(): change a member device's state, persisting the
 * new state in the superblock.  Caller holds state_lock.  Returns
 * -BCH_ERR_device_state_not_allowed if the transition would lose data.
 *
 * NOTE(review): braces and early `return 0` lines (e.g. original 1535)
 * are missing from this extraction -- verify upstream.
 */
1528 int __bch2_dev_set_state(struct bch_fs
*c
, struct bch_dev
*ca
,
1529 enum bch_member_state new_state
, int flags
)
1531 struct bch_member
*m
;
/* no-op if already in the requested state */
1534 if (ca
->mi
.state
== new_state
)
1537 if (!bch2_dev_state_allowed(c
, ca
, new_state
, flags
))
1538 return -BCH_ERR_device_state_not_allowed
;
/* leaving rw: stop allocator/journal before flipping the sb state */
1540 if (new_state
!= BCH_MEMBER_STATE_rw
)
1541 __bch2_dev_read_only(c
, ca
);
1543 bch_notice(ca
, "%s", bch2_member_states
[new_state
]);
/* persist the new state */
1545 mutex_lock(&c
->sb_lock
);
1546 m
= bch2_members_v2_get_mut(c
->disk_sb
.sb
, ca
->dev_idx
);
1547 SET_BCH_MEMBER_STATE(m
, new_state
);
1548 bch2_write_super(c
);
1549 mutex_unlock(&c
->sb_lock
);
1551 if (new_state
== BCH_MEMBER_STATE_rw
)
1552 __bch2_dev_read_write(c
, ca
);
1554 rebalance_wakeup(c
);
/*
 * bch2_dev_set_state(): locked wrapper -- takes state_lock around
 * __bch2_dev_set_state().
 */
1559 int bch2_dev_set_state(struct bch_fs
*c
, struct bch_dev
*ca
,
1560 enum bch_member_state new_state
, int flags
)
1564 down_write(&c
->state_lock
);
1565 ret
= __bch2_dev_set_state(c
, ca
, new_state
, flags
);
1566 up_write(&c
->state_lock
);
1571 /* Device add/removal: */
/*
 * bch2_dev_remove_alloc(): delete all allocation metadata for @ca's
 * device index across the alloc-related btrees (keys are POS(dev_idx, *)).
 * Deletions run with BTREE_TRIGGER_NORUN so triggers don't fire for
 * metadata that is going away wholesale.
 */
1573 static int bch2_dev_remove_alloc(struct bch_fs
*c
, struct bch_dev
*ca
)
1575 struct bpos start
= POS(ca
->dev_idx
, 0);
1576 struct bpos end
= POS(ca
->dev_idx
, U64_MAX
);
1580 * We clear the LRU and need_discard btrees first so that we don't race
1581 * with bch2_do_invalidates() and bch2_do_discards()
1583 ret
= bch2_btree_delete_range(c
, BTREE_ID_lru
, start
, end
,
1584 BTREE_TRIGGER_NORUN
, NULL
) ?:
1585 bch2_btree_delete_range(c
, BTREE_ID_need_discard
, start
, end
,
1586 BTREE_TRIGGER_NORUN
, NULL
) ?:
1587 bch2_btree_delete_range(c
, BTREE_ID_freespace
, start
, end
,
1588 BTREE_TRIGGER_NORUN
, NULL
) ?:
1589 bch2_btree_delete_range(c
, BTREE_ID_backpointers
, start
, end
,
1590 BTREE_TRIGGER_NORUN
, NULL
) ?:
1591 bch2_btree_delete_range(c
, BTREE_ID_alloc
, start
, end
,
1592 BTREE_TRIGGER_NORUN
, NULL
) ?:
1593 bch2_btree_delete_range(c
, BTREE_ID_bucket_gens
, start
, end
,
1594 BTREE_TRIGGER_NORUN
, NULL
);
1595 bch_err_msg(c
, ret
, "removing dev alloc info");
/*
 * bch2_dev_remove(): permanently remove a member device from the
 * filesystem: migrate/drop its data, delete its alloc metadata, flush
 * journal references, tear down the in-core object, then free its slot
 * in the superblock member array.  Consumes a ref on ca->ref.
 *
 * NOTE(review): the `goto err` lines after each step and the final
 * return/err label lines are missing from this extraction -- the
 * trailing lines (1695-1698) look like the error path that restores the
 * device to rw.  Verify against upstream super.c.
 */
1599 int bch2_dev_remove(struct bch_fs
*c
, struct bch_dev
*ca
, int flags
)
1601 struct bch_member
*m
;
1602 unsigned dev_idx
= ca
->dev_idx
, data
;
1605 down_write(&c
->state_lock
);
1608 * We consume a reference to ca->ref, regardless of whether we succeed
1611 percpu_ref_put(&ca
->ref
);
/* removal is only allowed if the remaining devices can hold the data */
1613 if (!bch2_dev_state_allowed(c
, ca
, BCH_MEMBER_STATE_failed
, flags
)) {
1614 bch_err(ca
, "Cannot remove without losing data");
1615 ret
= -BCH_ERR_device_state_not_allowed
;
1619 __bch2_dev_read_only(c
, ca
);
/* step 1: move or drop all user data off the device */
1621 ret
= bch2_dev_data_drop(c
, ca
->dev_idx
, flags
);
1622 bch_err_msg(ca
, ret
, "bch2_dev_data_drop()");
/* step 2: delete its allocation metadata */
1626 ret
= bch2_dev_remove_alloc(c
, ca
);
1627 bch_err_msg(ca
, ret
, "bch2_dev_remove_alloc()");
/* step 3: make sure the journal no longer pins this device */
1631 ret
= bch2_journal_flush_device_pins(&c
->journal
, ca
->dev_idx
);
1632 bch_err_msg(ca
, ret
, "bch2_journal_flush_device_pins()");
1636 ret
= bch2_journal_flush(&c
->journal
);
1637 bch_err_msg(ca
, ret
, "bch2_journal_flush()");
1641 ret
= bch2_replicas_gc2(c
);
1642 bch_err_msg(ca
, ret
, "bch2_replicas_gc2()");
/* sanity check: nothing should still live on this device */
1646 data
= bch2_dev_has_data(c
, ca
);
1648 struct printbuf data_has
= PRINTBUF
;
1650 prt_bitflags(&data_has
, __bch2_data_types
, data
);
1651 bch_err(ca
, "Remove failed, still has data (%s)", data_has
.buf
);
1652 printbuf_exit(&data_has
);
1657 __bch2_dev_offline(c
, ca
);
/* unpublish from c->devs[] so no new references can be taken */
1659 mutex_lock(&c
->sb_lock
);
1660 rcu_assign_pointer(c
->devs
[ca
->dev_idx
], NULL
);
1661 mutex_unlock(&c
->sb_lock
);
/* wait for all outstanding references to drain */
1663 percpu_ref_kill(&ca
->ref
);
1664 wait_for_completion(&ca
->ref_completion
);
1669 * At this point the device object has been removed in-core, but the
1670 * on-disk journal might still refer to the device index via sb device
1671 * usage entries. Recovery fails if it sees usage information for an
1672 * invalid device. Flush journal pins to push the back of the journal
1673 * past now invalid device index references before we update the
1674 * superblock, but after the device object has been removed so any
1675 * further journal writes elide usage info for the device.
1677 bch2_journal_flush_all_pins(&c
->journal
);
1680 * Free this device's slot in the bch_member array - all pointers to
1681 * this device must be gone:
1683 mutex_lock(&c
->sb_lock
);
1684 m
= bch2_members_v2_get_mut(c
->disk_sb
.sb
, dev_idx
);
1685 memset(&m
->uuid
, 0, sizeof(m
->uuid
));
1687 bch2_write_super(c
);
1689 mutex_unlock(&c
->sb_lock
);
1690 up_write(&c
->state_lock
);
1692 bch2_dev_usage_journal_reserve(c
);
/* error path: put the device back into service if it was rw */
1695 if (ca
->mi
.state
== BCH_MEMBER_STATE_rw
&&
1696 !percpu_ref_is_zero(&ca
->io_ref
))
1697 __bch2_dev_read_write(c
, ca
);
1698 up_write(&c
->state_lock
);
1702 /* Add new device to running filesystem: */
/*
 * bch2_dev_add(): format-time device add -- read the new device's
 * superblock from @path, allocate the in-core device, find a free member
 * slot, grow the members_v2 section, persist the new superblock, and
 * bring the device into service.
 *
 * NOTE(review): many error-handling lines (goto targets, intermediate
 * returns, label lines) are missing from this extraction -- the tail
 * (1825-1836) appears to be the error paths.  Verify upstream.
 */
1703 int bch2_dev_add(struct bch_fs
*c
, const char *path
)
1705 struct bch_opts opts
= bch2_opts_empty();
1706 struct bch_sb_handle sb
;
1707 struct bch_dev
*ca
= NULL
;
1708 struct bch_sb_field_members_v2
*mi
;
1709 struct bch_member dev_mi
;
1710 unsigned dev_idx
, nr_devices
, u64s
;
1711 struct printbuf errbuf
= PRINTBUF
;
1712 struct printbuf label
= PRINTBUF
;
/* read the candidate device's superblock */
1715 ret
= bch2_read_super(path
, &opts
, &sb
);
1716 bch_err_msg(c
, ret
, "reading super");
1720 dev_mi
= bch2_sb_member_get(sb
.sb
, sb
.sb
->dev_idx
);
/* capture the device's label (disk group), if it has one */
1722 if (BCH_MEMBER_GROUP(&dev_mi
)) {
1723 bch2_disk_path_to_text_sb(&label
, sb
.sb
, BCH_MEMBER_GROUP(&dev_mi
) - 1);
1724 if (label
.allocation_failure
) {
1730 ret
= bch2_dev_may_add(sb
.sb
, c
);
1734 ca
= __bch2_dev_alloc(c
, &dev_mi
);
1740 bch2_dev_usage_init(ca
);
1742 ret
= __bch2_dev_attach_bdev(ca
, &sb
);
1746 ret
= bch2_dev_journal_alloc(ca
);
1747 bch_err_msg(c
, ret
, "allocating journal");
/* now mutate the fs superblock under state_lock + sb_lock */
1751 down_write(&c
->state_lock
);
1752 mutex_lock(&c
->sb_lock
);
1754 ret
= bch2_sb_from_fs(c
, ca
);
1755 bch_err_msg(c
, ret
, "setting up new superblock");
/* fault-injection point: simulate no free member slot */
1759 if (dynamic_fault("bcachefs:add:no_slot"))
/* find the first unused member slot */
1762 for (dev_idx
= 0; dev_idx
< BCH_SB_MEMBERS_MAX
; dev_idx
++)
1763 if (!bch2_dev_exists(c
->disk_sb
.sb
, dev_idx
))
1766 ret
= -BCH_ERR_ENOSPC_sb_members
;
1767 bch_err_msg(c
, ret
, "setting up new superblock");
1771 nr_devices
= max_t(unsigned, dev_idx
+ 1, c
->sb
.nr_devices
);
/* grow the members_v2 section to cover the new slot */
1773 mi
= bch2_sb_field_get(c
->disk_sb
.sb
, members_v2
);
1774 u64s
= DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2
) +
1775 le16_to_cpu(mi
->member_bytes
) * nr_devices
, sizeof(u64
));
1777 mi
= bch2_sb_field_resize(&c
->disk_sb
, members_v2
, u64s
);
1779 ret
= -BCH_ERR_ENOSPC_sb_members
;
1780 bch_err_msg(c
, ret
, "setting up new superblock");
1783 struct bch_member
*m
= bch2_members_v2_get_mut(c
->disk_sb
.sb
, dev_idx
);
1788 m
->last_mount
= cpu_to_le64(ktime_get_real_seconds());
1789 c
->disk_sb
.sb
->nr_devices
= nr_devices
;
/* the new device learns its assigned slot */
1791 ca
->disk_sb
.sb
->dev_idx
= dev_idx
;
1792 bch2_dev_attach(c
, ca
, dev_idx
);
1794 if (BCH_MEMBER_GROUP(&dev_mi
)) {
1795 ret
= __bch2_dev_group_set(c
, ca
, label
.buf
);
1796 bch_err_msg(c
, ret
, "creating new label");
1801 bch2_write_super(c
);
1802 mutex_unlock(&c
->sb_lock
);
1804 bch2_dev_usage_journal_reserve(c
);
1806 ret
= bch2_trans_mark_dev_sb(c
, ca
);
1807 bch_err_msg(ca
, ret
, "marking new superblock");
1811 ret
= bch2_fs_freespace_init(c
);
1812 bch_err_msg(ca
, ret
, "initializing free space");
1816 ca
->new_fs_bucket_idx
= 0;
1818 if (ca
->mi
.state
== BCH_MEMBER_STATE_rw
)
1819 __bch2_dev_read_write(c
, ca
);
1821 up_write(&c
->state_lock
);
/* error paths: unwind locks, then common cleanup */
1825 mutex_unlock(&c
->sb_lock
);
1826 up_write(&c
->state_lock
);
1830 bch2_free_super(&sb
);
1831 printbuf_exit(&label
);
1832 printbuf_exit(&errbuf
);
1836 up_write(&c
->state_lock
);
1841 /* Hot add existing device to running filesystem: */
/*
 * bch2_dev_online(): hot-add an existing member device back into a
 * running filesystem: read its superblock, verify it belongs to this fs,
 * attach the bdev, mark its sb buckets, and (re)initialize free space
 * and journal if needed.
 *
 * NOTE(review): `goto err` lines and the return statements are missing
 * from this extraction (e.g. originals 1846-1849, 1853, 1900-1901) --
 * verify against upstream super.c.
 */
1842 int bch2_dev_online(struct bch_fs
*c
, const char *path
)
1844 struct bch_opts opts
= bch2_opts_empty();
1845 struct bch_sb_handle sb
= { NULL
};
1850 down_write(&c
->state_lock
);
1852 ret
= bch2_read_super(path
, &opts
, &sb
);
1854 up_write(&c
->state_lock
);
1858 dev_idx
= sb
.sb
->dev_idx
;
/* verify this superblock is a member of this filesystem */
1860 ret
= bch2_dev_in_fs(&c
->disk_sb
, &sb
, &c
->opts
);
1861 bch_err_msg(c
, ret
, "bringing %s online", path
);
1865 ret
= bch2_dev_attach_bdev(c
, &sb
);
1869 ca
= bch_dev_locked(c
, dev_idx
);
1871 ret
= bch2_trans_mark_dev_sb(c
, ca
);
1872 bch_err_msg(c
, ret
, "bringing %s online: error from bch2_trans_mark_dev_sb", path
);
1876 if (ca
->mi
.state
== BCH_MEMBER_STATE_rw
)
1877 __bch2_dev_read_write(c
, ca
);
/* first online after format: populate the freespace btree */
1879 if (!ca
->mi
.freespace_initialized
) {
1880 ret
= bch2_dev_freespace_init(c
, ca
, 0, ca
->mi
.nbuckets
);
1881 bch_err_msg(ca
, ret
, "initializing free space");
/* device has no journal buckets yet: allocate them */
1886 if (!ca
->journal
.nr
) {
1887 ret
= bch2_dev_journal_alloc(ca
);
1888 bch_err_msg(ca
, ret
, "allocating journal");
/* record the mount time in the member entry */
1893 mutex_lock(&c
->sb_lock
);
1894 bch2_members_v2_get_mut(c
->disk_sb
.sb
, ca
->dev_idx
)->last_mount
=
1895 cpu_to_le64(ktime_get_real_seconds());
1896 bch2_write_super(c
);
1897 mutex_unlock(&c
->sb_lock
);
1899 up_write(&c
->state_lock
);
/* error path cleanup */
1902 up_write(&c
->state_lock
);
1903 bch2_free_super(&sb
);
/*
 * bch2_dev_offline(): take a member device offline (drop its bdev) while
 * leaving it a member of the filesystem.  Fails if the device is already
 * offline or if the remaining devices couldn't serve reads.
 *
 * NOTE(review): some return lines (e.g. original 1914) are missing from
 * this extraction -- verify upstream.
 */
1907 int bch2_dev_offline(struct bch_fs
*c
, struct bch_dev
*ca
, int flags
)
1909 down_write(&c
->state_lock
);
1911 if (!bch2_dev_is_online(ca
)) {
1912 bch_err(ca
, "Already offline");
1913 up_write(&c
->state_lock
);
/* offlining is modeled as a transition to `failed` for the check */
1917 if (!bch2_dev_state_allowed(c
, ca
, BCH_MEMBER_STATE_failed
, flags
)) {
1918 bch_err(ca
, "Cannot offline required disk");
1919 up_write(&c
->state_lock
);
1920 return -BCH_ERR_device_state_not_allowed
;
1923 __bch2_dev_offline(c
, ca
);
1925 up_write(&c
->state_lock
);
/*
 * bch2_dev_resize(): grow a member device to @nbuckets buckets.
 * Shrinking is not supported.  Resizes the in-core bucket arrays, marks
 * the new sb region, persists the new size, extends the freespace btree
 * over the added buckets, and recomputes capacity.
 *
 * NOTE(review): `goto err` and the err label/return lines are missing
 * from this extraction -- verify against upstream super.c.
 */
1929 int bch2_dev_resize(struct bch_fs
*c
, struct bch_dev
*ca
, u64 nbuckets
)
1931 struct bch_member
*m
;
1935 down_write(&c
->state_lock
);
1936 old_nbuckets
= ca
->mi
.nbuckets
;
/* only growing is supported */
1938 if (nbuckets
< ca
->mi
.nbuckets
) {
1939 bch_err(ca
, "Cannot shrink yet");
/* the backing bdev must actually be big enough for the new size */
1944 if (bch2_dev_is_online(ca
) &&
1945 get_capacity(ca
->disk_sb
.bdev
->bd_disk
) <
1946 ca
->mi
.bucket_size
* nbuckets
) {
1947 bch_err(ca
, "New size larger than device");
1948 ret
= -BCH_ERR_device_size_too_small
;
1952 ret
= bch2_dev_buckets_resize(c
, ca
, nbuckets
);
1953 bch_err_msg(ca
, ret
, "resizing buckets");
1957 ret
= bch2_trans_mark_dev_sb(c
, ca
);
/* persist the new bucket count in the member entry */
1961 mutex_lock(&c
->sb_lock
);
1962 m
= bch2_members_v2_get_mut(c
->disk_sb
.sb
, ca
->dev_idx
);
1963 m
->nbuckets
= cpu_to_le64(nbuckets
);
1965 bch2_write_super(c
);
1966 mutex_unlock(&c
->sb_lock
);
/* extend the freespace btree over the newly added buckets */
1968 if (ca
->mi
.freespace_initialized
) {
1969 ret
= bch2_dev_freespace_init(c
, ca
, old_nbuckets
, nbuckets
);
1974 * XXX: this is all wrong transactionally - we'll be able to do
1975 * this correctly after the disk space accounting rewrite
1977 ca
->usage_base
->d
[BCH_DATA_free
].buckets
+= nbuckets
- old_nbuckets
;
1980 bch2_recalc_capacity(c
);
1982 up_write(&c
->state_lock
);
1986 /* return with ref on ca->ref: */
/*
 * bch2_dev_lookup(): find a member device by name.  Returns the device
 * (with a ref held on ca->ref, per the comment above) or
 * ERR_PTR(-BCH_ERR_ENOENT_dev_not_found).
 *
 * NOTE(review): the ref-taking and `return ca` lines inside the match
 * branch (originals ~1992-1995) are missing from this extraction.
 */
1987 struct bch_dev
*bch2_dev_lookup(struct bch_fs
*c
, const char *name
)
1990 for_each_member_device_rcu(c
, ca
, NULL
)
1991 if (!strcmp(name
, ca
->name
)) {
1996 return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found
);
1999 /* Filesystem open: */
2001 static inline int sb_cmp(struct bch_sb
*l
, struct bch_sb
*r
)
2003 return cmp_int(le64_to_cpu(l
->seq
), le64_to_cpu(r
->seq
)) ?:
2004 cmp_int(le64_to_cpu(l
->write_time
), le64_to_cpu(r
->write_time
));
/*
 * bch2_fs_open(): open a filesystem from a set of device paths -- read
 * every superblock, pick the newest one (by sb_cmp()) as authoritative,
 * drop members that were removed or are split-brain, allocate the fs,
 * attach every bdev, and start it unless opts.nostart.
 *
 * NOTE(review): many `goto err`/label/return lines are missing from this
 * extraction (e.g. originals 2018-2023, 2054-2062, 2094-2104) -- verify
 * against upstream super.c.
 */
2007 struct bch_fs
*bch2_fs_open(char * const *devices
, unsigned nr_devices
,
2008 struct bch_opts opts
)
2010 DARRAY(struct bch_sb_handle
) sbs
= { 0 };
2011 struct bch_fs
*c
= NULL
;
2012 struct bch_sb_handle
*best
= NULL
;
2013 struct printbuf errbuf
= PRINTBUF
;
/* pin the module while a filesystem is open */
2016 if (!try_module_get(THIS_MODULE
))
2017 return ERR_PTR(-ENODEV
);
2024 ret
= darray_make_room(&sbs
, nr_devices
);
/* read every device's superblock */
2028 for (unsigned i
= 0; i
< nr_devices
; i
++) {
2029 struct bch_sb_handle sb
= { NULL
};
2031 ret
= bch2_read_super(devices
[i
], &opts
, &sb
);
2035 BUG_ON(darray_push(&sbs
, sb
));
2038 if (opts
.nochanges
&& !opts
.read_only
) {
2039 ret
= -BCH_ERR_erofs_nochanges
;
/* pick the newest superblock as the authoritative one */
2043 darray_for_each(sbs
, sb
)
2044 if (!best
|| sb_cmp(sb
->sb
, best
->sb
) > 0)
/* drop devices that were removed or have diverged (split-brain) */
2047 darray_for_each_reverse(sbs
, sb
) {
2048 ret
= bch2_dev_in_fs(best
, sb
, &opts
);
2050 if (ret
== -BCH_ERR_device_has_been_removed
||
2051 ret
== -BCH_ERR_device_splitbrain
) {
2052 bch2_free_super(sb
);
2053 darray_remove_item(&sbs
, sb
);
2063 c
= bch2_fs_alloc(best
->sb
, opts
);
2064 ret
= PTR_ERR_OR_ZERO(c
);
/* attach every surviving bdev to the new fs */
2068 down_write(&c
->state_lock
);
2069 darray_for_each(sbs
, sb
) {
2070 ret
= bch2_dev_attach_bdev(c
, sb
);
2072 up_write(&c
->state_lock
);
2076 up_write(&c
->state_lock
);
2078 if (!bch2_fs_may_start(c
)) {
2079 ret
= -BCH_ERR_insufficient_devices_to_start
;
2083 if (!c
->opts
.nostart
) {
2084 ret
= bch2_fs_start(c
);
/* common exit: free the sb handles (ownership moved to the devices) */
2089 darray_for_each(sbs
, sb
)
2090 bch2_free_super(sb
);
2092 printbuf_exit(&errbuf
);
2093 module_put(THIS_MODULE
);
/* error path */
2096 pr_err("bch_fs_open err opening %s: %s",
2097 devices
[0], bch2_err_str(ret
));
2099 if (!IS_ERR_OR_NULL(c
))
2105 /* Global interfaces/init */
/*
 * bcachefs_exit(): module teardown -- unwind bcachefs_init() in reverse
 * order.  NOTE(review): extraction is missing originals 2108-2110 and
 * 2113 (other *_exit() calls and the kset NULL check) -- verify upstream.
 */
2107 static void bcachefs_exit(void)
2111 bch2_chardev_exit();
2112 bch2_btree_key_cache_exit();
2114 kset_unregister(bcachefs_kset
);
/*
 * bcachefs_init(): module init -- self-test bkey packing, then register
 * the sysfs kset and global subsystems.  NOTE(review): the remaining
 * init calls, the err label and the return statements (originals
 * 2124-2133) are missing from this extraction -- verify upstream.
 */
2117 static int __init
bcachefs_init(void)
2119 bch2_bkey_pack_test();
2121 if (!(bcachefs_kset
= kset_create_and_add("bcachefs", NULL
, fs_kobj
)) ||
2122 bch2_btree_key_cache_init() ||
2123 bch2_chardev_init() ||
/*
 * Expand each BCH_DEBUG_PARAM into a writable bool module parameter.
 * NOTE(review): the BCH_DEBUG_PARAMS() invocation line (original 2138)
 * is missing from this extraction.
 */
2134 #define BCH_DEBUG_PARAM(name, description) \
2136 module_param_named(name, bch2_##name, bool, 0644); \
2137 MODULE_PARM_DESC(name, description);
2139 #undef BCH_DEBUG_PARAM
/* Expose the current on-disk metadata version as a read-only parameter. */
2142 static unsigned bch2_metadata_version
= bcachefs_metadata_version_current
;
2143 module_param_named(version
, bch2_metadata_version
, uint
, 0400);
/* module entry/exit points */
2145 module_exit(bcachefs_exit
);
2146 module_init(bcachefs_init
);