git.ipfire.org Git - thirdparty/linux.git/blob

1 // SPDX-License-Identifier: GPL-2.0

2 /*

3 * bcachefs setup/teardown code, and some metadata io - read a superblock and

4 * figure out what to do with it.

5 *

8 */

10 #include "bcachefs.h"

11 #include "alloc_background.h"

12 #include "alloc_foreground.h"

13 #include "bkey_sort.h"

14 #include "btree_cache.h"

15 #include "btree_gc.h"

16 #include "btree_journal_iter.h"

17 #include "btree_key_cache.h"

18 #include "btree_node_scan.h"

19 #include "btree_update_interior.h"

20 #include "btree_io.h"

21 #include "btree_write_buffer.h"

22 #include "buckets_waiting_for_journal.h"

23 #include "chardev.h"

24 #include "checksum.h"

25 #include "clock.h"

26 #include "compress.h"

27 #include "debug.h"

28 #include "disk_accounting.h"

29 #include "disk_groups.h"

30 #include "ec.h"

31 #include "errcode.h"

32 #include "error.h"

33 #include "fs.h"

34 #include "fs-io.h"

35 #include "fs-io-buffered.h"

36 #include "fs-io-direct.h"

37 #include "fsck.h"

38 #include "inode.h"

39 #include "io_read.h"

40 #include "io_write.h"

41 #include "journal.h"

42 #include "journal_reclaim.h"

43 #include "journal_seq_blacklist.h"

44 #include "move.h"

45 #include "migrate.h"

46 #include "movinggc.h"

47 #include "nocow_locking.h"

48 #include "quota.h"

49 #include "rebalance.h"

50 #include "recovery.h"

51 #include "replicas.h"

52 #include "sb-clean.h"

53 #include "sb-counters.h"

54 #include "sb-errors.h"

55 #include "sb-members.h"

56 #include "snapshot.h"

57 #include "subvolume.h"

58 #include "super.h"

59 #include "super-io.h"

60 #include "sysfs.h"

61 #include "thread_with_file.h"

62 #include "trace.h"

64 #include <linux/backing-dev.h>

65 #include <linux/blkdev.h>

66 #include <linux/debugfs.h>

67 #include <linux/device.h>

68 #include <linux/idr.h>

69 #include <linux/module.h>

70 #include <linux/percpu.h>

71 #include <linux/random.h>

72 #include <linux/sysfs.h>

73 #include <crypto/hash.h>

75 MODULE_LICENSE("GPL");

76 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

77 MODULE_DESCRIPTION("bcachefs filesystem");

78 MODULE_SOFTDEP("pre: crc32c");

79 MODULE_SOFTDEP("pre: crc64");

80 MODULE_SOFTDEP("pre: sha256");

81 MODULE_SOFTDEP("pre: chacha20");

82 MODULE_SOFTDEP("pre: poly1305");

83 MODULE_SOFTDEP("pre: xxhash");

85 const char * const bch2_fs_flag_strs[] = {

86 #define x(n) #n,

87 BCH_FS_FLAGS()

88 #undef x

89 NULL

90 };

92 void bch2_print_str(struct bch_fs *c, const char *str)

93 {

94 #ifdef __KERNEL__

95 struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

97 if (unlikely(stdio)) {

98 bch2_stdio_redirect_printf(stdio, true, "%s", str);

99 return;

100 }

101 #endif

102 bch2_print_string_as_lines(KERN_ERR, str);

103 }

104

105 __printf(2, 0)

106 static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)

107 {

108 #ifdef __KERNEL__

109 if (unlikely(stdio)) {

110 if (fmt[0] == KERN_SOH[0])

111 fmt += 2;

112

113 bch2_stdio_redirect_vprintf(stdio, true, fmt, args);

114 return;

115 }

116 #endif

117 vprintk(fmt, args);

118 }

119

120 void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)

121 {

122 struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

123

124 va_list args;

125 va_start(args, fmt);

126 bch2_print_maybe_redirect(stdio, fmt, args);

127 va_end(args);

128 }

129

/*
 * printf-style output for filesystem @c, honouring the filesystem's stdio
 * redirect (as returned by bch2_fs_stdio_redirect()) when there is one.
 */
void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	bch2_print_maybe_redirect(bch2_fs_stdio_redirect(c), fmt, ap);
	va_end(ap);
}

139

/*
 * KTYPE(type) - generate the sysfs boilerplate for one kobject type.
 *
 * Defines type##_group, the NULL terminated type##_groups[] array and
 * type##_ktype. The expansion refers to type##_files, type##_release and
 * type##_sysfs_ops, which must already be declared at the point of use.
 */
#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs		= type ## _files				\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups	= type ## _groups,				\
	.release	= type ## _release				\
}

155

/* Release callbacks referenced by KTYPE() below; bch2_fs_release() is defined later in this file */
static void bch2_fs_release(struct kobject *kobj);
static void bch2_dev_release(struct kobject *kobj);

/*
 * The kobjects below are members of struct bch_fs (they are initialized in
 * bch2_fs_alloc()), so their release callbacks have nothing to do.
 */
static void bch2_fs_counters_release(struct kobject *k)
{
	/* nothing to free */
}

static void bch2_fs_internal_release(struct kobject *k)
{
	/* nothing to free */
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
	/* nothing to free */
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
	/* nothing to free */
}

/* Instantiate the attribute groups and kobj_types: */
KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

180

/* kset assigned to every filesystem's top level kobject in bch2_fs_alloc() */
static struct kset *bcachefs_kset;

/* List of filesystems that bch2_fs_online() has added; protected by bch_fs_list_lock */
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

/* Woken when writes have been disabled, and on emergency read-only */
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

/* Device helpers used before their definitions: */
static void bch2_dev_unlink(struct bch_dev *ca);
static void bch2_dev_free(struct bch_dev *ca);
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx);
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca);
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca);

192

193 struct bch_fs *bch2_dev_to_fs(dev_t dev)

194 {

195 struct bch_fs *c;

196

197 mutex_lock(&bch_fs_list_lock);

198 rcu_read_lock();

199

200 list_for_each_entry(c, &bch_fs_list, list)

201 for_each_member_device_rcu(c, ca, NULL)

202 if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {

203 closure_get(&c->cl);

204 goto found;

205 }

206 c = NULL;

207 found:

208 rcu_read_unlock();

209 mutex_unlock(&bch_fs_list_lock);

210

211 return c;

212 }

213

214 static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)

215 {

216 struct bch_fs *c;

217

218 lockdep_assert_held(&bch_fs_list_lock);

219

220 list_for_each_entry(c, &bch_fs_list, list)

221 if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))

222 return c;

223

224 return NULL;

225 }

226

227 struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)

228 {

229 struct bch_fs *c;

230

231 mutex_lock(&bch_fs_list_lock);

232 c = __bch2_uuid_to_fs(uuid);

233 if (c)

234 closure_get(&c->cl);

235 mutex_unlock(&bch_fs_list_lock);

236

237 return c;

238 }

239

240 /* Filesystem RO/RW: */

241

242 /*

243 * For startup/shutdown of RW stuff, the dependencies are:

244 *

245 * - foreground writes depend on copygc and rebalance (to free up space)

246 *

247 * - copygc and rebalance depend on mark and sweep gc (they actually probably

248 * don't because they either reserve ahead of time or don't block if

249 * allocations fail, but allocations can require mark and sweep gc to run

250 * because of generation number wraparound)

251 *

252 * - all of the above depends on the allocator threads

253 *

254 * - allocator depends on the journal (when it rewrites prios and gens)

255 */

256

257 static void __bch2_fs_read_only(struct bch_fs *c)

258 {

259 unsigned clean_passes = 0;

260 u64 seq = 0;

261

262 bch2_fs_ec_stop(c);

263 bch2_open_buckets_stop(c, NULL, true);

264 bch2_rebalance_stop(c);

265 bch2_copygc_stop(c);

266 bch2_fs_ec_flush(c);

267

268 bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",

269 journal_cur_seq(&c->journal));

270

271 do {

272 clean_passes++;

273

274 if (bch2_btree_interior_updates_flush(c) ||

275 bch2_btree_write_buffer_flush_going_ro(c) ||

276 bch2_journal_flush_all_pins(&c->journal) ||

277 bch2_btree_flush_all_writes(c) ||

278 seq != atomic64_read(&c->journal.seq)) {

279 seq = atomic64_read(&c->journal.seq);

280 clean_passes = 0;

281 }

282 } while (clean_passes < 2);

283

284 bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",

285 journal_cur_seq(&c->journal));

286

287 if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&

288 !test_bit(BCH_FS_emergency_ro, &c->flags))

289 set_bit(BCH_FS_clean_shutdown, &c->flags);

290

291 bch2_fs_journal_stop(&c->journal);

292

293 bch_info(c, "%sclean shutdown complete, journal seq %llu",

294 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",

295 c->journal.seq_ondisk);

296

297 /*

298 * After stopping journal:

299 */

300 for_each_member_device(c, ca)

301 bch2_dev_allocator_remove(c, ca);

302 }

303

304 #ifndef BCH_WRITE_REF_DEBUG

305 static void bch2_writes_disabled(struct percpu_ref *writes)

306 {

307 struct bch_fs *c = container_of(writes, struct bch_fs, writes);

308

309 set_bit(BCH_FS_write_disable_complete, &c->flags);

310 wake_up(&bch2_read_only_wait);

311 }

312 #endif

313

314 void bch2_fs_read_only(struct bch_fs *c)

315 {

316 if (!test_bit(BCH_FS_rw, &c->flags)) {

317 bch2_journal_reclaim_stop(&c->journal);

318 return;

319 }

320

321 BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));

322

323 bch_verbose(c, "going read-only");

324

325 /*

326 * Block new foreground-end write operations from starting - any new

327 * writes will return -EROFS:

328 */

329 set_bit(BCH_FS_going_ro, &c->flags);

330 #ifndef BCH_WRITE_REF_DEBUG

331 percpu_ref_kill(&c->writes);

332 #else

333 for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)

334 bch2_write_ref_put(c, i);

335 #endif

336

337 /*

338 * If we're not doing an emergency shutdown, we want to wait on

339 * outstanding writes to complete so they don't see spurious errors due

340 * to shutting down the allocator:

341 *

342 * If we are doing an emergency shutdown outstanding writes may

343 * hang until we shutdown the allocator so we don't want to wait

344 * on outstanding writes before shutting everything down - but

345 * we do need to wait on them before returning and signalling

346 * that going RO is complete:

347 */

348 wait_event(bch2_read_only_wait,

349 test_bit(BCH_FS_write_disable_complete, &c->flags) ||

350 test_bit(BCH_FS_emergency_ro, &c->flags));

351

352 bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);

353 if (writes_disabled)

354 bch_verbose(c, "finished waiting for writes to stop");

355

356 __bch2_fs_read_only(c);

357

358 wait_event(bch2_read_only_wait,

359 test_bit(BCH_FS_write_disable_complete, &c->flags));

360

361 if (!writes_disabled)

362 bch_verbose(c, "finished waiting for writes to stop");

363

364 clear_bit(BCH_FS_write_disable_complete, &c->flags);

365 clear_bit(BCH_FS_going_ro, &c->flags);

366 clear_bit(BCH_FS_rw, &c->flags);

367

368 if (!bch2_journal_error(&c->journal) &&

369 !test_bit(BCH_FS_error, &c->flags) &&

370 !test_bit(BCH_FS_emergency_ro, &c->flags) &&

371 test_bit(BCH_FS_started, &c->flags) &&

372 test_bit(BCH_FS_clean_shutdown, &c->flags) &&

373 c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {

374 BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));

375 BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));

376 BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));

377 BUG_ON(c->btree_write_buffer.inc.keys.nr);

378 BUG_ON(c->btree_write_buffer.flushing.keys.nr);

379 bch2_verify_accounting_clean(c);

380

381 bch_verbose(c, "marking filesystem clean");

382 bch2_fs_mark_clean(c);

383 } else {

384 bch_verbose(c, "done going read-only, filesystem not clean");

385 }

386 }

387

388 static void bch2_fs_read_only_work(struct work_struct *work)

389 {

390 struct bch_fs *c =

391 container_of(work, struct bch_fs, read_only_work);

392

393 down_write(&c->state_lock);

394 bch2_fs_read_only(c);

395 up_write(&c->state_lock);

396 }

397

398 static void bch2_fs_read_only_async(struct bch_fs *c)

399 {

400 queue_work(system_long_wq, &c->read_only_work);

401 }

402

403 bool bch2_fs_emergency_read_only(struct bch_fs *c)

404 {

405 bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

406

407 bch2_journal_halt(&c->journal);

408 bch2_fs_read_only_async(c);

409

410 wake_up(&bch2_read_only_wait);

411 return ret;

412 }

413

/*
 * Second stage of going read-write: start the data move threads.
 *
 * Data move operations can't run until check_snapshots has completed and
 * bch2_snapshot_is_ancestor() is available. Ideally copygc/rebalance would be
 * started earlier rather than after all of recovery/fsck.
 *
 * Returns 0 on success, or the error from starting copygc or rebalance.
 */
static int bch2_fs_read_write_late(struct bch_fs *c)
{
	int ret = bch2_copygc_start(c);

	if (ret) {
		bch_err(c, "error starting copygc thread");
		return ret;
	}

	ret = bch2_rebalance_start(c);
	if (ret)
		bch_err(c, "error starting rebalance thread");

	return ret;
}

439

440 static int __bch2_fs_read_write(struct bch_fs *c, bool early)

441 {

442 int ret;

443

444 BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

445

446 if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {

447 bch_err(c, "cannot go rw, unfixed btree errors");

448 return -BCH_ERR_erofs_unfixed_errors;

449 }

450

451 if (test_bit(BCH_FS_rw, &c->flags))

452 return 0;

453

454 bch_info(c, "going read-write");

455

456 ret = bch2_sb_members_v2_init(c);

457 if (ret)

458 goto err;

459

460 ret = bch2_fs_mark_dirty(c);

461 if (ret)

462 goto err;

463

464 clear_bit(BCH_FS_clean_shutdown, &c->flags);

465

466 /*

467 * First journal write must be a flush write: after a clean shutdown we

468 * don't read the journal, so the first journal write may end up

469 * overwriting whatever was there previously, and there must always be

470 * at least one non-flush write in the journal or recovery will fail:

471 */

472 set_bit(JOURNAL_need_flush_write, &c->journal.flags);

473 set_bit(JOURNAL_running, &c->journal.flags);

474

475 for_each_rw_member(c, ca)

476 bch2_dev_allocator_add(c, ca);

477 bch2_recalc_capacity(c);

478

479 set_bit(BCH_FS_rw, &c->flags);

480 set_bit(BCH_FS_was_rw, &c->flags);

481

482 #ifndef BCH_WRITE_REF_DEBUG

483 percpu_ref_reinit(&c->writes);

484 #else

485 for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {

486 BUG_ON(atomic_long_read(&c->writes[i]));

487 atomic_long_inc(&c->writes[i]);

488 }

489 #endif

490

491 ret = bch2_journal_reclaim_start(&c->journal);

492 if (ret)

493 goto err;

494

495 if (!early) {

496 ret = bch2_fs_read_write_late(c);

497 if (ret)

498 goto err;

499 }

500

501 bch2_do_discards(c);

502 bch2_do_invalidates(c);

503 bch2_do_stripe_deletes(c);

504 bch2_do_pending_node_rewrites(c);

505 return 0;

506 err:

507 if (test_bit(BCH_FS_rw, &c->flags))

508 bch2_fs_read_only(c);

509 else

510 __bch2_fs_read_only(c);

511 return ret;

512 }

513

514 int bch2_fs_read_write(struct bch_fs *c)

515 {

516 if (c->opts.recovery_pass_last &&

517 c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)

518 return -BCH_ERR_erofs_norecovery;

519

520 if (c->opts.nochanges)

521 return -BCH_ERR_erofs_nochanges;

522

523 return __bch2_fs_read_write(c, false);

524 }

525

526 int bch2_fs_read_write_early(struct bch_fs *c)

527 {

528 lockdep_assert_held(&c->state_lock);

529

530 return __bch2_fs_read_write(c, true);

531 }

532

533 /* Filesystem startup/shutdown: */

534

535 static void __bch2_fs_free(struct bch_fs *c)

536 {

537 for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)

538 bch2_time_stats_exit(&c->times[i]);

539

540 bch2_find_btree_nodes_exit(&c->found_btree_nodes);

541 bch2_free_pending_node_rewrites(c);

542 bch2_fs_accounting_exit(c);

543 bch2_fs_sb_errors_exit(c);

544 bch2_fs_counters_exit(c);

545 bch2_fs_snapshots_exit(c);

546 bch2_fs_quota_exit(c);

547 bch2_fs_fs_io_direct_exit(c);

548 bch2_fs_fs_io_buffered_exit(c);

549 bch2_fs_fsio_exit(c);

550 bch2_fs_vfs_exit(c);

551 bch2_fs_ec_exit(c);

552 bch2_fs_encryption_exit(c);

553 bch2_fs_nocow_locking_exit(c);

554 bch2_fs_io_write_exit(c);

555 bch2_fs_io_read_exit(c);

556 bch2_fs_buckets_waiting_for_journal_exit(c);

557 bch2_fs_btree_interior_update_exit(c);

558 bch2_fs_btree_key_cache_exit(&c->btree_key_cache);

559 bch2_fs_btree_cache_exit(c);

560 bch2_fs_btree_iter_exit(c);

561 bch2_fs_replicas_exit(c);

562 bch2_fs_journal_exit(&c->journal);

563 bch2_io_clock_exit(&c->io_clock[WRITE]);

564 bch2_io_clock_exit(&c->io_clock[READ]);

565 bch2_fs_compress_exit(c);

566 bch2_journal_keys_put_initial(c);

567 bch2_find_btree_nodes_exit(&c->found_btree_nodes);

568 BUG_ON(atomic_read(&c->journal_keys.ref));

569 bch2_fs_btree_write_buffer_exit(c);

570 percpu_free_rwsem(&c->mark_lock);

571 if (c->online_reserved) {

572 u64 v = percpu_u64_get(c->online_reserved);

573 WARN(v, "online_reserved not 0 at shutdown: %lli", v);

574 free_percpu(c->online_reserved);

575 }

576

577 darray_exit(&c->btree_roots_extra);

578 free_percpu(c->pcpu);

579 free_percpu(c->usage);

580 mempool_exit(&c->large_bkey_pool);

581 mempool_exit(&c->btree_bounce_pool);

582 bioset_exit(&c->btree_bio);

583 mempool_exit(&c->fill_iter);

584 #ifndef BCH_WRITE_REF_DEBUG

585 percpu_ref_exit(&c->writes);

586 #endif

587 kfree(rcu_dereference_protected(c->disk_groups, 1));

588 kfree(c->journal_seq_blacklist_table);

589 kfree(c->unused_inode_hints);

590

591 if (c->write_ref_wq)

592 destroy_workqueue(c->write_ref_wq);

593 if (c->btree_write_submit_wq)

594 destroy_workqueue(c->btree_write_submit_wq);

595 if (c->btree_read_complete_wq)

596 destroy_workqueue(c->btree_read_complete_wq);

597 if (c->copygc_wq)

598 destroy_workqueue(c->copygc_wq);

599 if (c->btree_io_complete_wq)

600 destroy_workqueue(c->btree_io_complete_wq);

601 if (c->btree_update_wq)

602 destroy_workqueue(c->btree_update_wq);

603

604 bch2_free_super(&c->disk_sb);

605 kvfree(c);

606 module_put(THIS_MODULE);

607 }

608

609 static void bch2_fs_release(struct kobject *kobj)

610 {

611 struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

612

613 __bch2_fs_free(c);

614 }

615

616 void __bch2_fs_stop(struct bch_fs *c)

617 {

618 bch_verbose(c, "shutting down");

619

620 set_bit(BCH_FS_stopping, &c->flags);

621

622 down_write(&c->state_lock);

623 bch2_fs_read_only(c);

624 up_write(&c->state_lock);

625

626 for_each_member_device(c, ca)

627 bch2_dev_unlink(ca);

628

629 if (c->kobj.state_in_sysfs)

630 kobject_del(&c->kobj);

631

632 bch2_fs_debug_exit(c);

633 bch2_fs_chardev_exit(c);

634

635 bch2_ro_ref_put(c);

636 wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

637

638 kobject_put(&c->counters_kobj);

639 kobject_put(&c->time_stats);

640 kobject_put(&c->opts_dir);

641 kobject_put(&c->internal);

642

643 /* btree prefetch might have kicked off reads in the background: */

644 bch2_btree_flush_all_reads(c);

645

646 for_each_member_device(c, ca)

647 cancel_work_sync(&ca->io_error_work);

648

649 cancel_work_sync(&c->read_only_work);

650 }

651

652 void bch2_fs_free(struct bch_fs *c)

653 {

654 unsigned i;

655

656 mutex_lock(&bch_fs_list_lock);

657 list_del(&c->list);

658 mutex_unlock(&bch_fs_list_lock);

659

660 closure_sync(&c->cl);

661 closure_debug_destroy(&c->cl);

662

663 for (i = 0; i < c->sb.nr_devices; i++) {

664 struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

665

666 if (ca) {

667 EBUG_ON(atomic_long_read(&ca->ref) != 1);

668 bch2_free_super(&ca->disk_sb);

669 bch2_dev_free(ca);

670 }

671 }

672

673 bch_verbose(c, "shutdown complete");

674

675 kobject_put(&c->kobj);

676 }

677

/* Full shutdown: quiesce the filesystem with __bch2_fs_stop(), then release it with bch2_fs_free() */
void bch2_fs_stop(struct bch_fs *c)
{
	/* Order matters: everything must be stopped before anything is freed */
	__bch2_fs_stop(c);

	bch2_fs_free(c);
}

683

684 static int bch2_fs_online(struct bch_fs *c)

685 {

686 int ret = 0;

687

688 lockdep_assert_held(&bch_fs_list_lock);

689

690 if (__bch2_uuid_to_fs(c->sb.uuid)) {

691 bch_err(c, "filesystem UUID already open");

692 return -EINVAL;

693 }

694

695 ret = bch2_fs_chardev_init(c);

696 if (ret) {

697 bch_err(c, "error creating character device");

698 return ret;

699 }

700

701 bch2_fs_debug_init(c);

702

703 ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:

704 kobject_add(&c->internal, &c->kobj, "internal") ?:

705 kobject_add(&c->opts_dir, &c->kobj, "options") ?:

706 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

707 kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:

708 #endif

709 kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:

710 bch2_opts_create_sysfs_files(&c->opts_dir);

711 if (ret) {

712 bch_err(c, "error creating sysfs objects");

713 return ret;

714 }

715

716 down_write(&c->state_lock);

717

718 for_each_member_device(c, ca) {

719 ret = bch2_dev_sysfs_online(c, ca);

720 if (ret) {

721 bch_err(c, "error creating sysfs objects");

722 bch2_dev_put(ca);

723 goto err;

724 }

725 }

726

727 BUG_ON(!list_empty(&c->list));

728 list_add(&c->list, &bch_fs_list);

729 err:

730 up_write(&c->state_lock);

731 return ret;

732 }

733

734 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)

735 {

736 struct bch_fs *c;

737 struct printbuf name = PRINTBUF;

738 unsigned i, iter_size;

739 int ret = 0;

740

741 c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);

742 if (!c) {

743 c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);

744 goto out;

745 }

746

747 c->stdio = (void *)(unsigned long) opts.stdio;

748

749 __module_get(THIS_MODULE);

750

751 closure_init(&c->cl, NULL);

752

753 c->kobj.kset = bcachefs_kset;

754 kobject_init(&c->kobj, &bch2_fs_ktype);

755 kobject_init(&c->internal, &bch2_fs_internal_ktype);

756 kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);

757 kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);

758 kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

759

760 c->minor = -1;

761 c->disk_sb.fs_sb = true;

762

763 init_rwsem(&c->state_lock);

764 mutex_init(&c->sb_lock);

765 mutex_init(&c->replicas_gc_lock);

766 mutex_init(&c->btree_root_lock);

767 INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

768

769 refcount_set(&c->ro_ref, 1);

770 init_waitqueue_head(&c->ro_ref_wait);

771 spin_lock_init(&c->recovery_pass_lock);

772 sema_init(&c->online_fsck_mutex, 1);

773

774 init_rwsem(&c->gc_lock);

775 mutex_init(&c->gc_gens_lock);

776

777 for (i = 0; i < BCH_TIME_STAT_NR; i++)

778 bch2_time_stats_init(&c->times[i]);

779

780 bch2_fs_gc_init(c);

781 bch2_fs_copygc_init(c);

782 bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);

783 bch2_fs_btree_iter_init_early(c);

784 bch2_fs_btree_interior_update_init_early(c);

785 bch2_fs_journal_keys_init(c);

786 bch2_fs_allocator_background_init(c);

787 bch2_fs_allocator_foreground_init(c);

788 bch2_fs_rebalance_init(c);

789 bch2_fs_quota_init(c);

790 bch2_fs_ec_init_early(c);

791 bch2_fs_move_init(c);

792 bch2_fs_sb_errors_init_early(c);

793

794 INIT_LIST_HEAD(&c->list);

795

796 mutex_init(&c->bio_bounce_pages_lock);

797 mutex_init(&c->snapshot_table_lock);

798 init_rwsem(&c->snapshot_create_lock);

799

800 spin_lock_init(&c->btree_write_error_lock);

801

802 INIT_LIST_HEAD(&c->journal_iters);

803

804 INIT_LIST_HEAD(&c->fsck_error_msgs);

805 mutex_init(&c->fsck_error_msgs_lock);

806

807 seqcount_init(&c->usage_lock);

808

809 sema_init(&c->io_in_flight, 128);

810

811 INIT_LIST_HEAD(&c->vfs_inodes_list);

812 mutex_init(&c->vfs_inodes_lock);

813

814 c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];

815 c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];

816 c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];

817

818 bch2_fs_btree_cache_init_early(&c->btree_cache);

819

820 mutex_init(&c->sectors_available_lock);

821

822 ret = percpu_init_rwsem(&c->mark_lock);

823 if (ret)

824 goto err;

825

826 mutex_lock(&c->sb_lock);

827 ret = bch2_sb_to_fs(c, sb);

828 mutex_unlock(&c->sb_lock);

829

830 if (ret)

831 goto err;

832

833 pr_uuid(&name, c->sb.user_uuid.b);

834 ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;

835 if (ret)

836 goto err;

837

838 strscpy(c->name, name.buf, sizeof(c->name));

839 printbuf_exit(&name);

840

841 /* Compat: */

842 if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&

843 !BCH_SB_JOURNAL_FLUSH_DELAY(sb))

844 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

845

846 if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&

847 !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))

848 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

849

850 c->opts = bch2_opts_default;

851 ret = bch2_opts_from_sb(&c->opts, sb);

852 if (ret)

853 goto err;

854

855 bch2_opts_apply(&c->opts, opts);

856

857 c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;

858 if (c->opts.inodes_use_key_cache)

859 c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;

860 c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

861

862 c->block_bits = ilog2(block_sectors(c));

863 c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

864

865 if (bch2_fs_init_fault("fs_alloc")) {

866 bch_err(c, "fs_alloc fault injected");

867 ret = -EFAULT;

868 goto err;

869 }

870

871 iter_size = sizeof(struct sort_iter) +

872 (btree_blocks(c) + 1) * 2 *

873 sizeof(struct sort_iter_set);

874

875 c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));

876

877 if (!(c->btree_update_wq = alloc_workqueue("bcachefs",

878 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||

879 !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",

880 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||

881 !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",

882 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||

883 !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",

884 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||

885 !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",

886 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||

887 !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",

888 WQ_FREEZABLE, 0)) ||

889 #ifndef BCH_WRITE_REF_DEBUG

890 percpu_ref_init(&c->writes, bch2_writes_disabled,

891 PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||

892 #endif

893 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||

894 bioset_init(&c->btree_bio, 1,

895 max(offsetof(struct btree_read_bio, bio),

896 offsetof(struct btree_write_bio, wbio.bio)),

897 BIOSET_NEED_BVECS) ||

898 !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||

899 !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||

900 !(c->online_reserved = alloc_percpu(u64)) ||

901 mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,

902 c->opts.btree_node_size) ||

903 mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||

904 !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,

905 sizeof(u64), GFP_KERNEL))) {

906 ret = -BCH_ERR_ENOMEM_fs_other_alloc;

907 goto err;

908 }

909

910 ret = bch2_fs_counters_init(c) ?:

911 bch2_fs_sb_errors_init(c) ?:

912 bch2_io_clock_init(&c->io_clock[READ]) ?:

913 bch2_io_clock_init(&c->io_clock[WRITE]) ?:

914 bch2_fs_journal_init(&c->journal) ?:

915 bch2_fs_btree_iter_init(c) ?:

916 bch2_fs_btree_cache_init(c) ?:

917 bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:

918 bch2_fs_btree_interior_update_init(c) ?:

919 bch2_fs_buckets_waiting_for_journal_init(c) ?:

920 bch2_fs_btree_write_buffer_init(c) ?:

921 bch2_fs_subvolumes_init(c) ?:

922 bch2_fs_io_read_init(c) ?:

923 bch2_fs_io_write_init(c) ?:

924 bch2_fs_nocow_locking_init(c) ?:

925 bch2_fs_encryption_init(c) ?:

926 bch2_fs_compress_init(c) ?:

927 bch2_fs_ec_init(c) ?:

928 bch2_fs_vfs_init(c) ?:

929 bch2_fs_fsio_init(c) ?:

930 bch2_fs_fs_io_buffered_init(c) ?:

931 bch2_fs_fs_io_direct_init(c);

932 if (ret)

933 goto err;

934

935 for (i = 0; i < c->sb.nr_devices; i++) {

936 if (!bch2_member_exists(c->disk_sb.sb, i))

937 continue;

938 ret = bch2_dev_alloc(c, i);

939 if (ret)

940 goto err;

941 }

942

943 bch2_journal_entry_res_resize(&c->journal,

944 &c->btree_root_journal_res,

945 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));

946 bch2_journal_entry_res_resize(&c->journal,

947 &c->clock_journal_res,

948 (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);

949

950 mutex_lock(&bch_fs_list_lock);

951 ret = bch2_fs_online(c);

952 mutex_unlock(&bch_fs_list_lock);

953

954 if (ret)

955 goto err;

956 out:

957 return c;

958 err:

959 bch2_fs_free(c);

960 c = ERR_PTR(ret);

961 goto out;

962 }

963

964 noinline_for_stack

965 static void print_mount_opts(struct bch_fs *c)

966 {

967 enum bch_opt_id i;

968 struct printbuf p = PRINTBUF;

969 bool first = true;

970

971 prt_str(&p, "starting version ");

972 bch2_version_to_text(&p, c->sb.version);

973

974 if (c->opts.read_only) {

975 prt_str(&p, " opts=");

976 first = false;

977 prt_printf(&p, "ro");

978 }

979

980 for (i = 0; i < bch2_opts_nr; i++) {

981 const struct bch_option *opt = &bch2_opt_table[i];

982 u64 v = bch2_opt_get_by_id(&c->opts, i);

983

984 if (!(opt->flags & OPT_MOUNT))

985 continue;

986

987 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))

988 continue;

989

990 prt_str(&p, first ? " opts=" : ",");

991 first = false;

992 bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);

993 }

994

995 bch_info(c, "%s", p.buf);

996 printbuf_exit(&p);

997 }

998

999 int bch2_fs_start(struct bch_fs *c)

1000 {

1001 time64_t now = ktime_get_real_seconds();

1002 int ret;

1003

1004 print_mount_opts(c);

1005

1006 down_write(&c->state_lock);

1007

1008 BUG_ON(test_bit(BCH_FS_started, &c->flags));

1009

1010 mutex_lock(&c->sb_lock);

1011

1012 ret = bch2_sb_members_v2_init(c);

1013 if (ret) {

1014 mutex_unlock(&c->sb_lock);

1015 goto err;

1016 }

1017

1018 for_each_online_member(c, ca)

1019 bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);

1020

1021 struct bch_sb_field_ext *ext =

1022 bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));

1023 mutex_unlock(&c->sb_lock);

1024

1025 if (!ext) {

1026 bch_err(c, "insufficient space in superblock for sb_field_ext");

1027 ret = -BCH_ERR_ENOSPC_sb;

1028 goto err;

1029 }

1030

1031 for_each_rw_member(c, ca)

1032 bch2_dev_allocator_add(c, ca);

1033 bch2_recalc_capacity(c);

1034

1035 c->recovery_task = current;

1036 ret = BCH_SB_INITIALIZED(c->disk_sb.sb)

1037 ? bch2_fs_recovery(c)

1038 : bch2_fs_initialize(c);

1039 c->recovery_task = NULL;

1040

1041 if (ret)

1042 goto err;

1043

1044 ret = bch2_opts_check_may_set(c);

1045 if (ret)

1046 goto err;

1047

1048 if (bch2_fs_init_fault("fs_start")) {

1049 bch_err(c, "fs_start fault injected");

1050 ret = -EINVAL;

1051 goto err;

1052 }

1053

1054 set_bit(BCH_FS_started, &c->flags);

1055

1056 if (c->opts.read_only) {

1057 bch2_fs_read_only(c);

1058 } else {

1059 ret = !test_bit(BCH_FS_rw, &c->flags)

1060 ? bch2_fs_read_write(c)

1061 : bch2_fs_read_write_late(c);

1062 if (ret)

1063 goto err;

1064 }

1065

1066 ret = 0;

1067 err:

1068 if (ret)

1069 bch_err_msg(c, ret, "starting filesystem");

1070 else

1071 bch_verbose(c, "done starting filesystem");

1072 up_write(&c->state_lock);

1073 return ret;

1074 }

1075

1076 static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)

1077 {

1078 struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

1079

1080 if (le16_to_cpu(sb->block_size) != block_sectors(c))

1081 return -BCH_ERR_mismatched_block_size;

1082

1083 if (le16_to_cpu(m.bucket_size) <

1084 BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))

1085 return -BCH_ERR_bucket_size_too_small;

1086

1087 return 0;

1088 }

1089

1090 static int bch2_dev_in_fs(struct bch_sb_handle *fs,

1091 struct bch_sb_handle *sb,

1092 struct bch_opts *opts)

1093 {

1094 if (fs == sb)

1095 return 0;

1096

1097 if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))

1098 return -BCH_ERR_device_not_a_member_of_filesystem;

1099

1100 if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))

1101 return -BCH_ERR_device_has_been_removed;

1102

1103 if (fs->sb->block_size != sb->sb->block_size)

1104 return -BCH_ERR_mismatched_block_size;

1105

1106 if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||

1107 le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)

1108 return 0;

1109

1110 if (fs->sb->seq == sb->sb->seq &&

1111 fs->sb->write_time != sb->sb->write_time) {

1112 struct printbuf buf = PRINTBUF;

1113

1114 prt_str(&buf, "Split brain detected between ");

1115 prt_bdevname(&buf, sb->bdev);

1116 prt_str(&buf, " and ");

1117 prt_bdevname(&buf, fs->bdev);

1118 prt_char(&buf, ':');

1119 prt_newline(&buf);

1120 prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));

1121 prt_newline(&buf);

1122

1123 prt_bdevname(&buf, fs->bdev);

1124 prt_char(&buf, ' ');

1125 bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));

1126 prt_newline(&buf);

1127

1128 prt_bdevname(&buf, sb->bdev);

1129 prt_char(&buf, ' ');

1130 bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));

1131 prt_newline(&buf);

1132

1133 if (!opts->no_splitbrain_check)

1134 prt_printf(&buf, "Not using older sb");

1135

1136 pr_err("%s", buf.buf);

1137 printbuf_exit(&buf);

1138

1139 if (!opts->no_splitbrain_check)

1140 return -BCH_ERR_device_splitbrain;

1141 }

1142

1143 struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);

1144 u64 seq_from_fs = le64_to_cpu(m.seq);

1145 u64 seq_from_member = le64_to_cpu(sb->sb->seq);

1146

1147 if (seq_from_fs && seq_from_fs < seq_from_member) {

1148 struct printbuf buf = PRINTBUF;

1149

1150 prt_str(&buf, "Split brain detected between ");

1151 prt_bdevname(&buf, sb->bdev);

1152 prt_str(&buf, " and ");

1153 prt_bdevname(&buf, fs->bdev);

1154 prt_char(&buf, ':');

1155 prt_newline(&buf);

1156

1157 prt_bdevname(&buf, fs->bdev);

1158 prt_str(&buf, " believes seq of ");

1159 prt_bdevname(&buf, sb->bdev);

1160 prt_printf(&buf, " to be %llu, but ", seq_from_fs);

1161 prt_bdevname(&buf, sb->bdev);

1162 prt_printf(&buf, " has %llu\n", seq_from_member);

1163

1164 if (!opts->no_splitbrain_check) {

1165 prt_str(&buf, "Not using ");

1166 prt_bdevname(&buf, sb->bdev);

1167 }

1168

1169 pr_err("%s", buf.buf);

1170 printbuf_exit(&buf);

1171

1172 if (!opts->no_splitbrain_check)

1173 return -BCH_ERR_device_splitbrain;

1174 }

1175

1176 return 0;

1177 }

1178

1179 /* Device startup/shutdown: */

1180

1181 static void bch2_dev_release(struct kobject *kobj)

1182 {

1183 struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

1184

1185 kfree(ca);

1186 }

1187

1188 static void bch2_dev_free(struct bch_dev *ca)

1189 {

1190 cancel_work_sync(&ca->io_error_work);

1191

1192 bch2_dev_unlink(ca);

1193

1194 if (ca->kobj.state_in_sysfs)

1195 kobject_del(&ca->kobj);

1196

1197 bch2_free_super(&ca->disk_sb);

1198 bch2_dev_allocator_background_exit(ca);

1199 bch2_dev_journal_exit(ca);

1200

1201 free_percpu(ca->io_done);

1202 bch2_dev_buckets_free(ca);

1203 kfree(ca->sb_read_scratch);

1204

1205 bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);

1206 bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

1207

1208 percpu_ref_exit(&ca->io_ref);

1209 #ifndef CONFIG_BCACHEFS_DEBUG

1210 percpu_ref_exit(&ca->ref);

1211 #endif

1212 kobject_put(&ca->kobj);

1213 }

1214

1215 static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)

1216 {

1217

1218 lockdep_assert_held(&c->state_lock);

1219

1220 if (percpu_ref_is_zero(&ca->io_ref))

1221 return;

1222

1223 __bch2_dev_read_only(c, ca);

1224

1225 reinit_completion(&ca->io_ref_completion);

1226 percpu_ref_kill(&ca->io_ref);

1227 wait_for_completion(&ca->io_ref_completion);

1228

1229 bch2_dev_unlink(ca);

1230

1231 bch2_free_super(&ca->disk_sb);

1232 bch2_dev_journal_exit(ca);

1233 }

1234

1235 #ifndef CONFIG_BCACHEFS_DEBUG

1236 static void bch2_dev_ref_complete(struct percpu_ref *ref)

1237 {

1238 struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

1239

1240 complete(&ca->ref_completion);

1241 }

1242 #endif

1243

1244 static void bch2_dev_io_ref_complete(struct percpu_ref *ref)

1245 {

1246 struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);

1247

1248 complete(&ca->io_ref_completion);

1249 }

1250

1251 static void bch2_dev_unlink(struct bch_dev *ca)

1252 {

1253 struct kobject *b;

1254

1255 /*

1256 * This is racy w.r.t. the underlying block device being hot-removed,

1257 * which removes it from sysfs.

1258 *

1259 * It'd be lovely if we had a way to handle this race, but the sysfs

1260 * code doesn't appear to provide a good method and block/holder.c is

1261 * susceptible as well:

1262 */

1263 if (ca->kobj.state_in_sysfs &&

1264 ca->disk_sb.bdev &&

1265 (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {

1266 sysfs_remove_link(b, "bcachefs");

1267 sysfs_remove_link(&ca->kobj, "block");

1268 }

1269 }

1270

1271 static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)

1272 {

1273 int ret;

1274

1275 if (!c->kobj.state_in_sysfs)

1276 return 0;

1277

1278 if (!ca->kobj.state_in_sysfs) {

1279 ret = kobject_add(&ca->kobj, &c->kobj,

1280 "dev-%u", ca->dev_idx);

1281 if (ret)

1282 return ret;

1283 }

1284

1285 if (ca->disk_sb.bdev) {

1286 struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

1287

1288 ret = sysfs_create_link(block, &ca->kobj, "bcachefs");

1289 if (ret)

1290 return ret;

1291

1292 ret = sysfs_create_link(&ca->kobj, block, "block");

1293 if (ret)

1294 return ret;

1295 }

1296

1297 return 0;

1298 }

1299

1300 static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,

1301 struct bch_member *member)

1302 {

1303 struct bch_dev *ca;

1304 unsigned i;

1305

1306 ca = kzalloc(sizeof(*ca), GFP_KERNEL);

1307 if (!ca)

1308 return NULL;

1309

1310 kobject_init(&ca->kobj, &bch2_dev_ktype);

1311 init_completion(&ca->ref_completion);

1312 init_completion(&ca->io_ref_completion);

1313

1314 init_rwsem(&ca->bucket_lock);

1315

1316 INIT_WORK(&ca->io_error_work, bch2_io_error_work);

1317

1318 bch2_time_stats_quantiles_init(&ca->io_latency[READ]);

1319 bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

1320

1321 ca->mi = bch2_mi_to_cpu(member);

1322

1323 for (i = 0; i < ARRAY_SIZE(member->errors); i++)

1324 atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

1325

1326 ca->uuid = member->uuid;

1327

1328 ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,

1329 ca->mi.bucket_size / btree_sectors(c));

1330

1331 #ifndef CONFIG_BCACHEFS_DEBUG

1332 if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))

1333 goto err;

1334 #else

1335 atomic_long_set(&ca->ref, 1);

1336 #endif

1337

1338 bch2_dev_allocator_background_init(ca);

1339

1340 if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,

1341 PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||

1342 !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||

1343 bch2_dev_buckets_alloc(c, ca) ||

1344 !(ca->io_done = alloc_percpu(*ca->io_done)))

1345 goto err;

1346

1347 return ca;

1348 err:

1349 bch2_dev_free(ca);

1350 return NULL;

1351 }

1352

1353 static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,

1354 unsigned dev_idx)

1355 {

1356 ca->dev_idx = dev_idx;

1357 __set_bit(ca->dev_idx, ca->self.d);

1358 scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

1359

1360 ca->fs = c;

1361 rcu_assign_pointer(c->devs[ca->dev_idx], ca);

1362

1363 if (bch2_dev_sysfs_online(c, ca))

1364 pr_warn("error creating sysfs objects");

1365 }

1366

1367 static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)

1368 {

1369 struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);

1370 struct bch_dev *ca = NULL;

1371

1372 if (bch2_fs_init_fault("dev_alloc"))

1373 goto err;

1374

1375 ca = __bch2_dev_alloc(c, &member);

1376 if (!ca)

1377 goto err;

1378

1379 ca->fs = c;

1380

1381 bch2_dev_attach(c, ca, dev_idx);

1382 return 0;

1383 err:

1384 return -BCH_ERR_ENOMEM_dev_alloc;

1385 }

1386

1387 static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)

1388 {

1389 unsigned ret;

1390

1391 if (bch2_dev_is_online(ca)) {

1392 bch_err(ca, "already have device online in slot %u",

1393 sb->sb->dev_idx);

1394 return -BCH_ERR_device_already_online;

1395 }

1396

1397 if (get_capacity(sb->bdev->bd_disk) <

1398 ca->mi.bucket_size * ca->mi.nbuckets) {

1399 bch_err(ca, "cannot online: device too small");

1400 return -BCH_ERR_device_size_too_small;

1401 }

1402

1403 BUG_ON(!percpu_ref_is_zero(&ca->io_ref));

1404

1405 ret = bch2_dev_journal_init(ca, sb->sb);

1406 if (ret)

1407 return ret;

1408

1409 /* Commit: */

1410 ca->disk_sb = *sb;

1411 memset(sb, 0, sizeof(*sb));

1412

1413 ca->dev = ca->disk_sb.bdev->bd_dev;

1414

1415 percpu_ref_reinit(&ca->io_ref);

1416

1417 return 0;

1418 }

1419

1420 static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)

1421 {

1422 struct bch_dev *ca;

1423 int ret;

1424

1425 lockdep_assert_held(&c->state_lock);

1426

1427 if (le64_to_cpu(sb->sb->seq) >

1428 le64_to_cpu(c->disk_sb.sb->seq))

1429 bch2_sb_to_fs(c, sb->sb);

1430

1431 BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));

1432

1433 ca = bch2_dev_locked(c, sb->sb->dev_idx);

1434

1435 ret = __bch2_dev_attach_bdev(ca, sb);

1436 if (ret)

1437 return ret;

1438

1439 bch2_dev_sysfs_online(c, ca);

1440

1441 struct printbuf name = PRINTBUF;

1442 prt_bdevname(&name, ca->disk_sb.bdev);

1443

1444 if (c->sb.nr_devices == 1)

1445 strscpy(c->name, name.buf, sizeof(c->name));

1446 strscpy(ca->name, name.buf, sizeof(ca->name));

1447

1448 printbuf_exit(&name);

1449

1450 rebalance_wakeup(c);

1451 return 0;

1452 }

1453

1454 /* Device management: */

1455

1456 /*

1457 * Note: this function is also used by the error paths - when a particular

1458 * device sees an error, we call it to determine whether we can just set the

1459 * device RO, or - if this function returns false - we'll set the whole

1460 * filesystem RO:

1461 *

1462 * XXX: maybe we should be more explicit about whether we're changing state

1463 * because we got an error or what have you?

1464 */

1465 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,

1466 enum bch_member_state new_state, int flags)

1467 {

1468 struct bch_devs_mask new_online_devs;

1469 int nr_rw = 0, required;

1470

1471 lockdep_assert_held(&c->state_lock);

1472

1473 switch (new_state) {

1474 case BCH_MEMBER_STATE_rw:

1475 return true;

1476 case BCH_MEMBER_STATE_ro:

1477 if (ca->mi.state != BCH_MEMBER_STATE_rw)

1478 return true;

1479

1480 /* do we have enough devices to write to? */

1481 for_each_member_device(c, ca2)

1482 if (ca2 != ca)

1483 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;

1484

1485 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)

1486 ? c->opts.metadata_replicas

1487 : metadata_replicas_required(c),

1488 !(flags & BCH_FORCE_IF_DATA_DEGRADED)

1489 ? c->opts.data_replicas

1490 : data_replicas_required(c));

1491

1492 return nr_rw >= required;

1493 case BCH_MEMBER_STATE_failed:

1494 case BCH_MEMBER_STATE_spare:

1495 if (ca->mi.state != BCH_MEMBER_STATE_rw &&

1496 ca->mi.state != BCH_MEMBER_STATE_ro)

1497 return true;

1498

1499 /* do we have enough devices to read from? */

1500 new_online_devs = bch2_online_devs(c);

1501 __clear_bit(ca->dev_idx, new_online_devs.d);

1502

1503 return bch2_have_enough_devs(c, new_online_devs, flags, false);

1504 default:

1505 BUG();

1506 }

1507 }

1508

1509 static bool bch2_fs_may_start(struct bch_fs *c)

1510 {

1511 struct bch_dev *ca;

1512 unsigned i, flags = 0;

1513

1514 if (c->opts.very_degraded)

1515 flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;

1516

1517 if (c->opts.degraded)

1518 flags |= BCH_FORCE_IF_DEGRADED;

1519

1520 if (!c->opts.degraded &&

1521 !c->opts.very_degraded) {

1522 mutex_lock(&c->sb_lock);

1523

1524 for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {

1525 if (!bch2_member_exists(c->disk_sb.sb, i))

1526 continue;

1527

1528 ca = bch2_dev_locked(c, i);

1529

1530 if (!bch2_dev_is_online(ca) &&

1531 (ca->mi.state == BCH_MEMBER_STATE_rw ||

1532 ca->mi.state == BCH_MEMBER_STATE_ro)) {

1533 mutex_unlock(&c->sb_lock);

1534 return false;

1535 }

1536 }

1537 mutex_unlock(&c->sb_lock);

1538 }

1539

1540 return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);

1541 }

1542

1543 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)

1544 {

1545 /*

1546 * The allocator thread itself allocates btree nodes, so stop it first:

1547 */

1548 bch2_dev_allocator_remove(c, ca);

1549 bch2_recalc_capacity(c);

1550 bch2_dev_journal_stop(&c->journal, ca);

1551 }

1552

1553 static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)

1554 {

1555 lockdep_assert_held(&c->state_lock);

1556

1557 BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

1558

1559 bch2_dev_allocator_add(c, ca);

1560 bch2_recalc_capacity(c);

1561 bch2_dev_do_discards(ca);

1562 }

1563

1564 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,

1565 enum bch_member_state new_state, int flags)

1566 {

1567 struct bch_member *m;

1568 int ret = 0;

1569

1570 if (ca->mi.state == new_state)

1571 return 0;

1572

1573 if (!bch2_dev_state_allowed(c, ca, new_state, flags))

1574 return -BCH_ERR_device_state_not_allowed;

1575

1576 if (new_state != BCH_MEMBER_STATE_rw)

1577 __bch2_dev_read_only(c, ca);

1578

1579 bch_notice(ca, "%s", bch2_member_states[new_state]);

1580

1581 mutex_lock(&c->sb_lock);

1582 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

1583 SET_BCH_MEMBER_STATE(m, new_state);

1584 bch2_write_super(c);

1585 mutex_unlock(&c->sb_lock);

1586

1587 if (new_state == BCH_MEMBER_STATE_rw)

1588 __bch2_dev_read_write(c, ca);

1589

1590 rebalance_wakeup(c);

1591

1592 return ret;

1593 }

1594

1595 int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,

1596 enum bch_member_state new_state, int flags)

1597 {

1598 int ret;

1599

1600 down_write(&c->state_lock);

1601 ret = __bch2_dev_set_state(c, ca, new_state, flags);

1602 up_write(&c->state_lock);

1603

1604 return ret;

1605 }

1606

1607 /* Device add/removal: */

1608

1609 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)

1610 {

1611 struct bch_member *m;

1612 unsigned dev_idx = ca->dev_idx, data;

1613 int ret;

1614

1615 down_write(&c->state_lock);

1616

1617 /*

1618 * We consume a reference to ca->ref, regardless of whether we succeed

1619 * or fail:

1620 */

1621 bch2_dev_put(ca);

1622

1623 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {

1624 bch_err(ca, "Cannot remove without losing data");

1625 ret = -BCH_ERR_device_state_not_allowed;

1626 goto err;

1627 }

1628

1629 __bch2_dev_read_only(c, ca);

1630

1631 ret = bch2_dev_data_drop(c, ca->dev_idx, flags);

1632 bch_err_msg(ca, ret, "bch2_dev_data_drop()");

1633 if (ret)

1634 goto err;

1635

1636 ret = bch2_dev_remove_alloc(c, ca);

1637 bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");

1638 if (ret)

1639 goto err;

1640

1641 /*

1642 * We need to flush the entire journal to get rid of keys that reference

1643 * the device being removed before removing the superblock entry

1644 */

1645 bch2_journal_flush_all_pins(&c->journal);

1646

1647 /*

1648 * this is really just needed for the bch2_replicas_gc_(start|end)

1649 * calls, and could be cleaned up:

1650 */

1651 ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);

1652 bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");

1653 if (ret)

1654 goto err;

1655

1656 ret = bch2_journal_flush(&c->journal);

1657 bch_err_msg(ca, ret, "bch2_journal_flush()");

1658 if (ret)

1659 goto err;

1660

1661 ret = bch2_replicas_gc2(c);

1662 bch_err_msg(ca, ret, "bch2_replicas_gc2()");

1663 if (ret)

1664 goto err;

1665

1666 data = bch2_dev_has_data(c, ca);

1667 if (data) {

1668 struct printbuf data_has = PRINTBUF;

1669

1670 prt_bitflags(&data_has, __bch2_data_types, data);

1671 bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);

1672 printbuf_exit(&data_has);

1673 ret = -EBUSY;

1674 goto err;

1675 }

1676

1677 __bch2_dev_offline(c, ca);

1678

1679 mutex_lock(&c->sb_lock);

1680 rcu_assign_pointer(c->devs[ca->dev_idx], NULL);

1681 mutex_unlock(&c->sb_lock);

1682

1683 #ifndef CONFIG_BCACHEFS_DEBUG

1684 percpu_ref_kill(&ca->ref);

1685 #else

1686 ca->dying = true;

1687 bch2_dev_put(ca);

1688 #endif

1689 wait_for_completion(&ca->ref_completion);

1690

1691 bch2_dev_free(ca);

1692

1693 /*

1694 * Free this device's slot in the bch_member array - all pointers to

1695 * this device must be gone:

1696 */

1697 mutex_lock(&c->sb_lock);

1698 m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

1699 memset(&m->uuid, 0, sizeof(m->uuid));

1700

1701 bch2_write_super(c);

1702

1703 mutex_unlock(&c->sb_lock);

1704 up_write(&c->state_lock);

1705 return 0;

1706 err:

1707 if (ca->mi.state == BCH_MEMBER_STATE_rw &&

1708 !percpu_ref_is_zero(&ca->io_ref))

1709 __bch2_dev_read_write(c, ca);

1710 up_write(&c->state_lock);

1711 return ret;

1712 }

1713

1714 /* Add new device to running filesystem: */

1715 int bch2_dev_add(struct bch_fs *c, const char *path)

1716 {

1717 struct bch_opts opts = bch2_opts_empty();

1718 struct bch_sb_handle sb;

1719 struct bch_dev *ca = NULL;

1720 struct printbuf errbuf = PRINTBUF;

1721 struct printbuf label = PRINTBUF;

1722 int ret;

1723

1724 ret = bch2_read_super(path, &opts, &sb);

1725 bch_err_msg(c, ret, "reading super");

1726 if (ret)

1727 goto err;

1728

1729 struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

1730

1731 if (BCH_MEMBER_GROUP(&dev_mi)) {

1732 bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);

1733 if (label.allocation_failure) {

1734 ret = -ENOMEM;

1735 goto err;

1736 }

1737 }

1738

1739 ret = bch2_dev_may_add(sb.sb, c);

1740 if (ret)

1741 goto err;

1742

1743 ca = __bch2_dev_alloc(c, &dev_mi);

1744 if (!ca) {

1745 ret = -ENOMEM;

1746 goto err;

1747 }

1748

1749 ret = __bch2_dev_attach_bdev(ca, &sb);

1750 if (ret)

1751 goto err;

1752

1753 down_write(&c->state_lock);

1754 mutex_lock(&c->sb_lock);

1755

1756 ret = bch2_sb_from_fs(c, ca);

1757 bch_err_msg(c, ret, "setting up new superblock");

1758 if (ret)

1759 goto err_unlock;

1760

1761 if (dynamic_fault("bcachefs:add:no_slot"))

1762 goto err_unlock;

1763

1764 ret = bch2_sb_member_alloc(c);

1765 if (ret < 0) {

1766 bch_err_msg(c, ret, "setting up new superblock");

1767 goto err_unlock;

1768 }

1769 unsigned dev_idx = ret;

1770

1771 /* success: */

1772

1773 dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());

1774 *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

1775

1776 ca->disk_sb.sb->dev_idx = dev_idx;

1777 bch2_dev_attach(c, ca, dev_idx);

1778

1779 if (BCH_MEMBER_GROUP(&dev_mi)) {

1780 ret = __bch2_dev_group_set(c, ca, label.buf);

1781 bch_err_msg(c, ret, "creating new label");

1782 if (ret)

1783 goto err_unlock;

1784 }

1785

1786 bch2_write_super(c);

1787 mutex_unlock(&c->sb_lock);

1788

1789 ret = bch2_dev_usage_init(ca, false);

1790 if (ret)

1791 goto err_late;

1792

1793 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);

1794 bch_err_msg(ca, ret, "marking new superblock");

1795 if (ret)

1796 goto err_late;

1797

1798 ret = bch2_fs_freespace_init(c);

1799 bch_err_msg(ca, ret, "initializing free space");

1800 if (ret)

1801 goto err_late;

1802

1803 if (ca->mi.state == BCH_MEMBER_STATE_rw)

1804 __bch2_dev_read_write(c, ca);

1805

1806 ret = bch2_dev_journal_alloc(ca, false);

1807 bch_err_msg(c, ret, "allocating journal");

1808 if (ret)

1809 goto err_late;

1810

1811 up_write(&c->state_lock);

1812 return 0;

1813

1814 err_unlock:

1815 mutex_unlock(&c->sb_lock);

1816 up_write(&c->state_lock);

1817 err:

1818 if (ca)

1819 bch2_dev_free(ca);

1820 bch2_free_super(&sb);

1821 printbuf_exit(&label);

1822 printbuf_exit(&errbuf);

1823 bch_err_fn(c, ret);

1824 return ret;

1825 err_late:

1826 up_write(&c->state_lock);

1827 ca = NULL;

1828 goto err;

1829 }

1830

1831 /* Hot add existing device to running filesystem: */

1832 int bch2_dev_online(struct bch_fs *c, const char *path)

1833 {

1834 struct bch_opts opts = bch2_opts_empty();

1835 struct bch_sb_handle sb = { NULL };

1836 struct bch_dev *ca;

1837 unsigned dev_idx;

1838 int ret;

1839

1840 down_write(&c->state_lock);

1841

1842 ret = bch2_read_super(path, &opts, &sb);

1843 if (ret) {

1844 up_write(&c->state_lock);

1845 return ret;

1846 }

1847

1848 dev_idx = sb.sb->dev_idx;

1849

1850 ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);

1851 bch_err_msg(c, ret, "bringing %s online", path);

1852 if (ret)

1853 goto err;

1854

1855 ret = bch2_dev_attach_bdev(c, &sb);

1856 if (ret)

1857 goto err;

1858

1859 ca = bch2_dev_locked(c, dev_idx);

1860

1861 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);

1862 bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);

1863 if (ret)

1864 goto err;

1865

1866 if (ca->mi.state == BCH_MEMBER_STATE_rw)

1867 __bch2_dev_read_write(c, ca);

1868

1869 if (!ca->mi.freespace_initialized) {

1870 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);

1871 bch_err_msg(ca, ret, "initializing free space");

1872 if (ret)

1873 goto err;

1874 }

1875

1876 if (!ca->journal.nr) {

1877 ret = bch2_dev_journal_alloc(ca, false);

1878 bch_err_msg(ca, ret, "allocating journal");

1879 if (ret)

1880 goto err;

1881 }

1882

1883 mutex_lock(&c->sb_lock);

1884 bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =

1885 cpu_to_le64(ktime_get_real_seconds());

1886 bch2_write_super(c);

1887 mutex_unlock(&c->sb_lock);

1888

1889 up_write(&c->state_lock);

1890 return 0;

1891 err:

1892 up_write(&c->state_lock);

1893 bch2_free_super(&sb);

1894 return ret;

1895 }

1896

1897 int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)

1898 {

1899 down_write(&c->state_lock);

1900

1901 if (!bch2_dev_is_online(ca)) {

1902 bch_err(ca, "Already offline");

1903 up_write(&c->state_lock);

1904 return 0;

1905 }

1906

1907 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {

1908 bch_err(ca, "Cannot offline required disk");

1909 up_write(&c->state_lock);

1910 return -BCH_ERR_device_state_not_allowed;

1911 }

1912

1913 __bch2_dev_offline(c, ca);

1914

1915 up_write(&c->state_lock);

1916 return 0;

1917 }

1918

1919 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)

1920 {

1921 struct bch_member *m;

1922 u64 old_nbuckets;

1923 int ret = 0;

1924

1925 down_write(&c->state_lock);

1926 old_nbuckets = ca->mi.nbuckets;

1927

1928 if (nbuckets < ca->mi.nbuckets) {

1929 bch_err(ca, "Cannot shrink yet");

1930 ret = -EINVAL;

1931 goto err;

1932 }

1933

1934 if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {

1935 bch_err(ca, "New device size too big (%llu greater than max %u)",

1936 nbuckets, BCH_MEMBER_NBUCKETS_MAX);

1937 ret = -BCH_ERR_device_size_too_big;

1938 goto err;

1939 }

1940

1941 if (bch2_dev_is_online(ca) &&

1942 get_capacity(ca->disk_sb.bdev->bd_disk) <

1943 ca->mi.bucket_size * nbuckets) {

1944 bch_err(ca, "New size larger than device");

1945 ret = -BCH_ERR_device_size_too_small;

1946 goto err;

1947 }

1948

1949 ret = bch2_dev_buckets_resize(c, ca, nbuckets);

1950 bch_err_msg(ca, ret, "resizing buckets");

1951 if (ret)

1952 goto err;

1953

1954 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);

1955 if (ret)

1956 goto err;

1957

1958 mutex_lock(&c->sb_lock);

1959 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

1960 m->nbuckets = cpu_to_le64(nbuckets);

1961

1962 bch2_write_super(c);

1963 mutex_unlock(&c->sb_lock);

1964

1965 if (ca->mi.freespace_initialized) {

1966 struct disk_accounting_pos acc = {

1967 .type = BCH_DISK_ACCOUNTING_dev_data_type,

1968 .dev_data_type.dev = ca->dev_idx,

1969 .dev_data_type.data_type = BCH_DATA_free,

1970 };

1971 u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };

1972

1973 ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,

1974 bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:

1975 bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);

1976 if (ret)

1977 goto err;

1978 }

1979

1980 bch2_recalc_capacity(c);

1981 err:

1982 up_write(&c->state_lock);

1983 return ret;

1984 }

1985

1986 /* return with ref on ca->ref: */

1987 struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)

1988 {

1989 if (!strncmp(name, "/dev/", strlen("/dev/")))

1990 name += strlen("/dev/");

1991

1992 for_each_member_device(c, ca)

1993 if (!strcmp(name, ca->name))

1994 return ca;

1995 return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);

1996 }

1997

1998 /* Filesystem open: */

1999

2000 static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)

2001 {

2002 return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:

2003 cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));

2004 }

2005

2006 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,

2007 struct bch_opts opts)

2008 {

2009 DARRAY(struct bch_sb_handle) sbs = { 0 };

2010 struct bch_fs *c = NULL;

2011 struct bch_sb_handle *best = NULL;

2012 struct printbuf errbuf = PRINTBUF;

2013 int ret = 0;

2014

2015 if (!try_module_get(THIS_MODULE))

2016 return ERR_PTR(-ENODEV);

2017

2018 if (!nr_devices) {

2019 ret = -EINVAL;

2020 goto err;

2021 }

2022

2023 ret = darray_make_room(&sbs, nr_devices);

2024 if (ret)

2025 goto err;

2026

2027 for (unsigned i = 0; i < nr_devices; i++) {

2028 struct bch_sb_handle sb = { NULL };

2029

2030 ret = bch2_read_super(devices[i], &opts, &sb);

2031 if (ret)

2032 goto err;

2033

2034 BUG_ON(darray_push(&sbs, sb));

2035 }

2036

2037 if (opts.nochanges && !opts.read_only) {

2038 ret = -BCH_ERR_erofs_nochanges;

2039 goto err_print;

2040 }

2041

2042 darray_for_each(sbs, sb)

2043 if (!best || sb_cmp(sb->sb, best->sb) > 0)

2044 best = sb;

2045

2046 darray_for_each_reverse(sbs, sb) {

2047 ret = bch2_dev_in_fs(best, sb, &opts);

2048

2049 if (ret == -BCH_ERR_device_has_been_removed ||

2050 ret == -BCH_ERR_device_splitbrain) {

2051 bch2_free_super(sb);

2052 darray_remove_item(&sbs, sb);

2053 best -= best > sb;

2054 ret = 0;

2055 continue;

2056 }

2057

2058 if (ret)

2059 goto err_print;

2060 }

2061

2062 c = bch2_fs_alloc(best->sb, opts);

2063 ret = PTR_ERR_OR_ZERO(c);

2064 if (ret)

2065 goto err;

2066

2067 down_write(&c->state_lock);

2068 darray_for_each(sbs, sb) {

2069 ret = bch2_dev_attach_bdev(c, sb);

2070 if (ret) {

2071 up_write(&c->state_lock);

2072 goto err;

2073 }

2074 }

2075 up_write(&c->state_lock);

2076

2077 if (!bch2_fs_may_start(c)) {

2078 ret = -BCH_ERR_insufficient_devices_to_start;

2079 goto err_print;

2080 }

2081

2082 if (!c->opts.nostart) {

2083 ret = bch2_fs_start(c);

2084 if (ret)

2085 goto err;

2086 }

2087 out:

2088 darray_for_each(sbs, sb)

2089 bch2_free_super(sb);

2090 darray_exit(&sbs);

2091 printbuf_exit(&errbuf);

2092 module_put(THIS_MODULE);

2093 return c;

2094 err_print:

2095 pr_err("bch_fs_open err opening %s: %s",

2096 devices[0], bch2_err_str(ret));

2097 err:

2098 if (!IS_ERR_OR_NULL(c))

2099 bch2_fs_stop(c);

2100 c = ERR_PTR(ret);

2101 goto out;

2102 }

2103

2104 /* Global interfaces/init */

2105

2106 static void bcachefs_exit(void)

2107 {

2108 bch2_debug_exit();

2109 bch2_vfs_exit();

2110 bch2_chardev_exit();

2111 bch2_btree_key_cache_exit();

2112 if (bcachefs_kset)

2113 kset_unregister(bcachefs_kset);

2114 }

2115

2116 static int __init bcachefs_init(void)

2117 {

2118 bch2_bkey_pack_test();

2119

2120 if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||

2121 bch2_btree_key_cache_init() ||

2122 bch2_chardev_init() ||

2123 bch2_vfs_init() ||

2124 bch2_debug_init())

2125 goto err;

2126

2127 return 0;

2128 err:

2129 bcachefs_exit();

2130 return -ENOMEM;

2131 }

2132

2133 #define BCH_DEBUG_PARAM(name, description) \

2134 bool bch2_##name; \

2135 module_param_named(name, bch2_##name, bool, 0644); \

2136 MODULE_PARM_DESC(name, description);

2137 BCH_DEBUG_PARAMS()

2138 #undef BCH_DEBUG_PARAM

2139

2140 __maybe_unused

2141 static unsigned bch2_metadata_version = bcachefs_metadata_version_current;

2142 module_param_named(version, bch2_metadata_version, uint, 0400);

2143

2144 module_exit(bcachefs_exit);

2145 module_init(bcachefs_init);