git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
Remove bcachefs core code
author    Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 29 Sep 2025 20:43:52 +0000 (13:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 29 Sep 2025 20:43:52 +0000 (13:43 -0700)
bcachefs was marked 'externally maintained' in 6.17 but the code
remained to make the transition smoother.

It's now a DKMS module, making the in-kernel code stale, so remove
it to avoid any version confusion.

Link: https://lore.kernel.org/linux-bcachefs/yokpt2d2g2lluyomtqrdvmkl3amv3kgnipmenobkpgx537kay7@xgcgjviv3n7x/T/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
284 files changed:
Documentation/filesystems/bcachefs/CodingStyle.rst [deleted file]
Documentation/filesystems/bcachefs/SubmittingPatches.rst [deleted file]
Documentation/filesystems/bcachefs/casefolding.rst [deleted file]
Documentation/filesystems/bcachefs/errorcodes.rst [deleted file]
Documentation/filesystems/bcachefs/future/idle_work.rst [deleted file]
Documentation/filesystems/bcachefs/index.rst [deleted file]
Documentation/filesystems/index.rst
MAINTAINERS
arch/m68k/configs/amiga_defconfig
arch/m68k/configs/apollo_defconfig
arch/m68k/configs/atari_defconfig
arch/m68k/configs/bvme6000_defconfig
arch/m68k/configs/hp300_defconfig
arch/m68k/configs/mac_defconfig
arch/m68k/configs/multi_defconfig
arch/m68k/configs/mvme147_defconfig
arch/m68k/configs/mvme16x_defconfig
arch/m68k/configs/q40_defconfig
arch/m68k/configs/sun3_defconfig
arch/m68k/configs/sun3x_defconfig
arch/s390/configs/debug_defconfig
arch/s390/configs/defconfig
fs/Kconfig
fs/Makefile
fs/bcachefs/Kconfig [deleted file]
fs/bcachefs/Makefile [deleted file]
fs/bcachefs/acl.c [deleted file]
fs/bcachefs/acl.h [deleted file]
fs/bcachefs/alloc_background.c [deleted file]
fs/bcachefs/alloc_background.h [deleted file]
fs/bcachefs/alloc_background_format.h [deleted file]
fs/bcachefs/alloc_foreground.c [deleted file]
fs/bcachefs/alloc_foreground.h [deleted file]
fs/bcachefs/alloc_types.h [deleted file]
fs/bcachefs/async_objs.c [deleted file]
fs/bcachefs/async_objs.h [deleted file]
fs/bcachefs/async_objs_types.h [deleted file]
fs/bcachefs/backpointers.c [deleted file]
fs/bcachefs/backpointers.h [deleted file]
fs/bcachefs/bbpos.h [deleted file]
fs/bcachefs/bbpos_types.h [deleted file]
fs/bcachefs/bcachefs.h [deleted file]
fs/bcachefs/bcachefs_format.h [deleted file]
fs/bcachefs/bcachefs_ioctl.h [deleted file]
fs/bcachefs/bkey.c [deleted file]
fs/bcachefs/bkey.h [deleted file]
fs/bcachefs/bkey_buf.h [deleted file]
fs/bcachefs/bkey_cmp.h [deleted file]
fs/bcachefs/bkey_methods.c [deleted file]
fs/bcachefs/bkey_methods.h [deleted file]
fs/bcachefs/bkey_sort.c [deleted file]
fs/bcachefs/bkey_sort.h [deleted file]
fs/bcachefs/bkey_types.h [deleted file]
fs/bcachefs/bset.c [deleted file]
fs/bcachefs/bset.h [deleted file]
fs/bcachefs/btree_cache.c [deleted file]
fs/bcachefs/btree_cache.h [deleted file]
fs/bcachefs/btree_gc.c [deleted file]
fs/bcachefs/btree_gc.h [deleted file]
fs/bcachefs/btree_gc_types.h [deleted file]
fs/bcachefs/btree_io.c [deleted file]
fs/bcachefs/btree_io.h [deleted file]
fs/bcachefs/btree_iter.c [deleted file]
fs/bcachefs/btree_iter.h [deleted file]
fs/bcachefs/btree_journal_iter.c [deleted file]
fs/bcachefs/btree_journal_iter.h [deleted file]
fs/bcachefs/btree_journal_iter_types.h [deleted file]
fs/bcachefs/btree_key_cache.c [deleted file]
fs/bcachefs/btree_key_cache.h [deleted file]
fs/bcachefs/btree_key_cache_types.h [deleted file]
fs/bcachefs/btree_locking.c [deleted file]
fs/bcachefs/btree_locking.h [deleted file]
fs/bcachefs/btree_node_scan.c [deleted file]
fs/bcachefs/btree_node_scan.h [deleted file]
fs/bcachefs/btree_node_scan_types.h [deleted file]
fs/bcachefs/btree_trans_commit.c [deleted file]
fs/bcachefs/btree_types.h [deleted file]
fs/bcachefs/btree_update.c [deleted file]
fs/bcachefs/btree_update.h [deleted file]
fs/bcachefs/btree_update_interior.c [deleted file]
fs/bcachefs/btree_update_interior.h [deleted file]
fs/bcachefs/btree_write_buffer.c [deleted file]
fs/bcachefs/btree_write_buffer.h [deleted file]
fs/bcachefs/btree_write_buffer_types.h [deleted file]
fs/bcachefs/buckets.c [deleted file]
fs/bcachefs/buckets.h [deleted file]
fs/bcachefs/buckets_types.h [deleted file]
fs/bcachefs/buckets_waiting_for_journal.c [deleted file]
fs/bcachefs/buckets_waiting_for_journal.h [deleted file]
fs/bcachefs/buckets_waiting_for_journal_types.h [deleted file]
fs/bcachefs/chardev.c [deleted file]
fs/bcachefs/chardev.h [deleted file]
fs/bcachefs/checksum.c [deleted file]
fs/bcachefs/checksum.h [deleted file]
fs/bcachefs/clock.c [deleted file]
fs/bcachefs/clock.h [deleted file]
fs/bcachefs/clock_types.h [deleted file]
fs/bcachefs/compress.c [deleted file]
fs/bcachefs/compress.h [deleted file]
fs/bcachefs/darray.c [deleted file]
fs/bcachefs/darray.h [deleted file]
fs/bcachefs/data_update.c [deleted file]
fs/bcachefs/data_update.h [deleted file]
fs/bcachefs/debug.c [deleted file]
fs/bcachefs/debug.h [deleted file]
fs/bcachefs/dirent.c [deleted file]
fs/bcachefs/dirent.h [deleted file]
fs/bcachefs/dirent_format.h [deleted file]
fs/bcachefs/disk_accounting.c [deleted file]
fs/bcachefs/disk_accounting.h [deleted file]
fs/bcachefs/disk_accounting_format.h [deleted file]
fs/bcachefs/disk_accounting_types.h [deleted file]
fs/bcachefs/disk_groups.c [deleted file]
fs/bcachefs/disk_groups.h [deleted file]
fs/bcachefs/disk_groups_format.h [deleted file]
fs/bcachefs/disk_groups_types.h [deleted file]
fs/bcachefs/ec.c [deleted file]
fs/bcachefs/ec.h [deleted file]
fs/bcachefs/ec_format.h [deleted file]
fs/bcachefs/ec_types.h [deleted file]
fs/bcachefs/enumerated_ref.c [deleted file]
fs/bcachefs/enumerated_ref.h [deleted file]
fs/bcachefs/enumerated_ref_types.h [deleted file]
fs/bcachefs/errcode.c [deleted file]
fs/bcachefs/errcode.h [deleted file]
fs/bcachefs/error.c [deleted file]
fs/bcachefs/error.h [deleted file]
fs/bcachefs/extent_update.c [deleted file]
fs/bcachefs/extent_update.h [deleted file]
fs/bcachefs/extents.c [deleted file]
fs/bcachefs/extents.h [deleted file]
fs/bcachefs/extents_format.h [deleted file]
fs/bcachefs/extents_types.h [deleted file]
fs/bcachefs/eytzinger.c [deleted file]
fs/bcachefs/eytzinger.h [deleted file]
fs/bcachefs/fast_list.c [deleted file]
fs/bcachefs/fast_list.h [deleted file]
fs/bcachefs/fifo.h [deleted file]
fs/bcachefs/fs-io-buffered.c [deleted file]
fs/bcachefs/fs-io-buffered.h [deleted file]
fs/bcachefs/fs-io-direct.c [deleted file]
fs/bcachefs/fs-io-direct.h [deleted file]
fs/bcachefs/fs-io-pagecache.c [deleted file]
fs/bcachefs/fs-io-pagecache.h [deleted file]
fs/bcachefs/fs-io.c [deleted file]
fs/bcachefs/fs-io.h [deleted file]
fs/bcachefs/fs-ioctl.c [deleted file]
fs/bcachefs/fs-ioctl.h [deleted file]
fs/bcachefs/fs.c [deleted file]
fs/bcachefs/fs.h [deleted file]
fs/bcachefs/fsck.c [deleted file]
fs/bcachefs/fsck.h [deleted file]
fs/bcachefs/inode.c [deleted file]
fs/bcachefs/inode.h [deleted file]
fs/bcachefs/inode_format.h [deleted file]
fs/bcachefs/io_misc.c [deleted file]
fs/bcachefs/io_misc.h [deleted file]
fs/bcachefs/io_read.c [deleted file]
fs/bcachefs/io_read.h [deleted file]
fs/bcachefs/io_write.c [deleted file]
fs/bcachefs/io_write.h [deleted file]
fs/bcachefs/io_write_types.h [deleted file]
fs/bcachefs/journal.c [deleted file]
fs/bcachefs/journal.h [deleted file]
fs/bcachefs/journal_io.c [deleted file]
fs/bcachefs/journal_io.h [deleted file]
fs/bcachefs/journal_reclaim.c [deleted file]
fs/bcachefs/journal_reclaim.h [deleted file]
fs/bcachefs/journal_sb.c [deleted file]
fs/bcachefs/journal_sb.h [deleted file]
fs/bcachefs/journal_seq_blacklist.c [deleted file]
fs/bcachefs/journal_seq_blacklist.h [deleted file]
fs/bcachefs/journal_seq_blacklist_format.h [deleted file]
fs/bcachefs/journal_types.h [deleted file]
fs/bcachefs/keylist.c [deleted file]
fs/bcachefs/keylist.h [deleted file]
fs/bcachefs/keylist_types.h [deleted file]
fs/bcachefs/logged_ops.c [deleted file]
fs/bcachefs/logged_ops.h [deleted file]
fs/bcachefs/logged_ops_format.h [deleted file]
fs/bcachefs/lru.c [deleted file]
fs/bcachefs/lru.h [deleted file]
fs/bcachefs/lru_format.h [deleted file]
fs/bcachefs/mean_and_variance.c [deleted file]
fs/bcachefs/mean_and_variance.h [deleted file]
fs/bcachefs/mean_and_variance_test.c [deleted file]
fs/bcachefs/migrate.c [deleted file]
fs/bcachefs/migrate.h [deleted file]
fs/bcachefs/move.c [deleted file]
fs/bcachefs/move.h [deleted file]
fs/bcachefs/move_types.h [deleted file]
fs/bcachefs/movinggc.c [deleted file]
fs/bcachefs/movinggc.h [deleted file]
fs/bcachefs/namei.c [deleted file]
fs/bcachefs/namei.h [deleted file]
fs/bcachefs/nocow_locking.c [deleted file]
fs/bcachefs/nocow_locking.h [deleted file]
fs/bcachefs/nocow_locking_types.h [deleted file]
fs/bcachefs/opts.c [deleted file]
fs/bcachefs/opts.h [deleted file]
fs/bcachefs/printbuf.c [deleted file]
fs/bcachefs/printbuf.h [deleted file]
fs/bcachefs/progress.c [deleted file]
fs/bcachefs/progress.h [deleted file]
fs/bcachefs/quota.c [deleted file]
fs/bcachefs/quota.h [deleted file]
fs/bcachefs/quota_format.h [deleted file]
fs/bcachefs/quota_types.h [deleted file]
fs/bcachefs/rcu_pending.c [deleted file]
fs/bcachefs/rcu_pending.h [deleted file]
fs/bcachefs/rebalance.c [deleted file]
fs/bcachefs/rebalance.h [deleted file]
fs/bcachefs/rebalance_format.h [deleted file]
fs/bcachefs/rebalance_types.h [deleted file]
fs/bcachefs/recovery.c [deleted file]
fs/bcachefs/recovery.h [deleted file]
fs/bcachefs/recovery_passes.c [deleted file]
fs/bcachefs/recovery_passes.h [deleted file]
fs/bcachefs/recovery_passes_format.h [deleted file]
fs/bcachefs/recovery_passes_types.h [deleted file]
fs/bcachefs/reflink.c [deleted file]
fs/bcachefs/reflink.h [deleted file]
fs/bcachefs/reflink_format.h [deleted file]
fs/bcachefs/replicas.c [deleted file]
fs/bcachefs/replicas.h [deleted file]
fs/bcachefs/replicas_format.h [deleted file]
fs/bcachefs/replicas_types.h [deleted file]
fs/bcachefs/sb-clean.c [deleted file]
fs/bcachefs/sb-clean.h [deleted file]
fs/bcachefs/sb-counters.c [deleted file]
fs/bcachefs/sb-counters.h [deleted file]
fs/bcachefs/sb-counters_format.h [deleted file]
fs/bcachefs/sb-downgrade.c [deleted file]
fs/bcachefs/sb-downgrade.h [deleted file]
fs/bcachefs/sb-downgrade_format.h [deleted file]
fs/bcachefs/sb-errors.c [deleted file]
fs/bcachefs/sb-errors.h [deleted file]
fs/bcachefs/sb-errors_format.h [deleted file]
fs/bcachefs/sb-errors_types.h [deleted file]
fs/bcachefs/sb-members.c [deleted file]
fs/bcachefs/sb-members.h [deleted file]
fs/bcachefs/sb-members_format.h [deleted file]
fs/bcachefs/sb-members_types.h [deleted file]
fs/bcachefs/seqmutex.h [deleted file]
fs/bcachefs/siphash.c [deleted file]
fs/bcachefs/siphash.h [deleted file]
fs/bcachefs/six.c [deleted file]
fs/bcachefs/six.h [deleted file]
fs/bcachefs/snapshot.c [deleted file]
fs/bcachefs/snapshot.h [deleted file]
fs/bcachefs/snapshot_format.h [deleted file]
fs/bcachefs/snapshot_types.h [deleted file]
fs/bcachefs/str_hash.c [deleted file]
fs/bcachefs/str_hash.h [deleted file]
fs/bcachefs/subvolume.c [deleted file]
fs/bcachefs/subvolume.h [deleted file]
fs/bcachefs/subvolume_format.h [deleted file]
fs/bcachefs/subvolume_types.h [deleted file]
fs/bcachefs/super-io.c [deleted file]
fs/bcachefs/super-io.h [deleted file]
fs/bcachefs/super.c [deleted file]
fs/bcachefs/super.h [deleted file]
fs/bcachefs/super_types.h [deleted file]
fs/bcachefs/sysfs.c [deleted file]
fs/bcachefs/sysfs.h [deleted file]
fs/bcachefs/tests.c [deleted file]
fs/bcachefs/tests.h [deleted file]
fs/bcachefs/thread_with_file.c [deleted file]
fs/bcachefs/thread_with_file.h [deleted file]
fs/bcachefs/thread_with_file_types.h [deleted file]
fs/bcachefs/time_stats.c [deleted file]
fs/bcachefs/time_stats.h [deleted file]
fs/bcachefs/trace.c [deleted file]
fs/bcachefs/trace.h [deleted file]
fs/bcachefs/two_state_shared_lock.c [deleted file]
fs/bcachefs/two_state_shared_lock.h [deleted file]
fs/bcachefs/util.c [deleted file]
fs/bcachefs/util.h [deleted file]
fs/bcachefs/varint.c [deleted file]
fs/bcachefs/varint.h [deleted file]
fs/bcachefs/vstructs.h [deleted file]
fs/bcachefs/xattr.c [deleted file]
fs/bcachefs/xattr.h [deleted file]
fs/bcachefs/xattr_format.h [deleted file]

diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst
deleted file mode 100644 (file)
index b29562a..0000000
+++ /dev/null
@@ -1,186 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-bcachefs coding style
-=====================
-
-Good development is like gardening, and codebases are our gardens. Tend to them
-every day; look for little things that are out of place or in need of tidying.
-A little weeding here and there goes a long way; don't wait until things have
-spiraled out of control.
-
-Things don't always have to be perfect - nitpicking often does more harm than
-good. But appreciate beauty when you see it - and let people know.
-
-The code that you are afraid to touch is the code most in need of refactoring.
-
-A little organizing here and there goes a long way.
-
-Put real thought into how you organize things.
-
-Good code is readable code, where the structure is simple and leaves nowhere
-for bugs to hide.
-
-Assertions are one of our most important tools for writing reliable code. If in
-the course of writing a patchset you encounter a condition that shouldn't
-happen (and will have unpredictable or undefined behaviour if it does), or
-you're not sure if it can happen and not sure how to handle it yet - make it a
-BUG_ON(). Don't leave undefined or unspecified behavior lurking in the codebase.
-
-By the time you finish the patchset, you should understand better which
-assertions need to be handled and turned into checks with error paths, and
-which should be logically impossible. Leave the BUG_ON()s in for the ones which
-are logically impossible. (Or, make them debug mode assertions if they're
-expensive - but don't turn everything into a debug mode assertion, so that
-we're not stuck debugging undefined behaviour should it turn out that you were
-wrong).
-
-Assertions are documentation that can't go out of date. Good assertions are
-wonderful.
-
-Good assertions drastically and dramatically reduce the amount of testing
-required to shake out bugs.
-
-Good assertions are based on state, not logic. To write good assertions, you
-have to think about what the invariants on your state are.
-
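For illustration, a minimal C sketch of a state-based assertion (hypothetical
structure, not from the bcachefs tree): the checks validate invariants on the
data itself, so they can be dropped anywhere the structure is supposed to be
consistent, unlike a logic-based check that only makes sense at the site that
computed a value.

    #include <linux/bug.h>
    #include <linux/types.h>

    /* Hypothetical: a sorted key array with a running count. */
    struct sorted_keys {
            unsigned        nr;
            unsigned        capacity;
            u64             keys[];
    };

    static void sorted_keys_assert_valid(const struct sorted_keys *k)
    {
            BUG_ON(k->nr > k->capacity);            /* capacity invariant */

            for (unsigned i = 1; i < k->nr; i++)    /* ordering invariant */
                    BUG_ON(k->keys[i - 1] > k->keys[i]);
    }
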
-Good invariants and assertions will hold everywhere in your codebase. This
-means that you can run them in only a few places in the checked in version, but
-should you need to debug something that caused the assertion to fail, you can
-quickly shotgun them everywhere to find the codepath that broke the invariant.
-
-A good assertion checks something that the compiler could check for us, and
-elide - if we were working in a language with embedded correctness proofs that
-the compiler could check. This is something that exists today, but it'll likely
-still be a few decades before it comes to systems programming languages. But we
-can still incorporate that kind of thinking into our code and document the
-invariants with runtime checks - much like the way people working in
-dynamically typed languages may add type annotations, gradually making their
-code statically typed.
-
-Looking for ways to make your assertions simpler - and higher level - will
-often nudge you towards making the entire system simpler and more robust.
-
-Good code is code where you can poke around and see what it's doing -
-introspection. We can't debug anything if we can't see what's going on.
-
-Whenever we're debugging, and the solution isn't immediately obvious, if the
-issue is that we don't know where the issue is because we can't see what's
-going on - fix that first.
-
-We have the tools to make anything visible at runtime, efficiently - RCU and
-percpu data structures among them. Don't let things stay hidden.
-
-The most important tool for introspection is the humble pretty printer - in
-bcachefs, this means `*_to_text()` functions, which output to printbufs.
-
-Pretty printers are wonderful, because they compose and you can use them
-everywhere. Having functions to print whatever object you're working with will
-make your error messages much easier to write (therefore they will actually
-exist) and much more informative. And they can be used from sysfs/debugfs, as
-well as tracepoints.
-
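To illustrate the shape of such a function: a minimal sketch with a
hypothetical object, using the prt_printf()/prt_str() printbuf helpers that
appear in the bcachefs code later in this commit.

    #include <linux/types.h>
    #include "printbuf.h"   /* fs/bcachefs/printbuf.h, deleted below */

    /* Hypothetical object - the *_to_text() pattern is the point. */
    struct bucket_stats {
            u64     sectors_used;
            u64     sectors_free;
            bool    copygc_waiting;
    };

    static void bucket_stats_to_text(struct printbuf *out,
                                     const struct bucket_stats *s)
    {
            prt_printf(out, "used:\t%llu\n", s->sectors_used);
            prt_printf(out, "free:\t%llu\n", s->sectors_free);
            if (s->copygc_waiting)
                    prt_str(out, "copygc waiting\n");
    }
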
-Runtime info and debugging tools should come with clear descriptions and
-labels, and good structure - we don't want files with a list of bare integers,
-like in procfs. Part of the job of the debugging tools is to educate users and
-new developers as to how the system works.
-
-Error messages should, whenever possible, tell you everything you need to debug
-the issue. It's worth putting effort into them.
-
-Tracepoints shouldn't be the first thing you reach for. They're an important
-tool, but always look for more immediate ways to make things visible. When we
-have to rely on tracing, we have to know which tracepoints we're looking for,
-and then we have to run the troublesome workload, and then we have to sift
-through logs. This is a lot of steps to go through when a user is hitting
-something, and if it's intermittent it may not even be possible.
-
-The humble counter is an incredibly useful tool. They're cheap and simple to
-use, and many complicated internal operations with lots of things that can
-behave weirdly (anything involving memory reclaim, for example) become
-shockingly easy to debug once you have counters on every distinct codepath.
-
-Persistent counters are even better.
-
-When debugging, try to get the most out of every bug you come across; don't
-rush to fix the initial issue. Look for things that will make related bugs
-easier the next time around - introspection, new assertions, better error
-messages, new debug tools, and do those first. Look for ways to make the system
-better behaved; often one bug will uncover several other bugs through
-downstream effects.
-
-Fix all that first, and then the original bug last - even if that means keeping
-a user waiting. They'll thank you in the long run, and when they understand
-what you're doing you'll be amazed at how patient they're happy to be. Users
-like to help - otherwise they wouldn't be reporting the bug in the first place.
-
-Talk to your users. Don't isolate yourself.
-
-Users notice all sorts of interesting things, and by just talking to them and
-interacting with them you can benefit from their experience.
-
-Spend time doing support and helpdesk stuff. Don't just write code - code isn't
-finished until it's being used trouble free.
-
-This will also motivate you to make your debugging tools as good as possible,
-and perhaps even your documentation, too. Like anything else in life, the more
-time you spend at it the better you'll get, and you the developer are the
-person most able to improve the tools to make debugging quick and easy.
-
-Be wary of how you take on and commit to big projects. Don't let development
-become product-manager focused. Oftentimes an idea is a good one but needs to
-wait for its proper time - but you won't know if it's the proper time for an
-idea until you start writing code.
-
-Expect to throw a lot of things away, or leave them half finished for later.
-Nobody writes all perfect code that all gets shipped, and you'll be much more
-productive in the long run if you notice this early and shift to something
-else. The experience gained and lessons learned will be valuable for all the
-other work you do.
-
-But don't be afraid to tackle projects that require significant rework of
-existing code. Sometimes these can be the best projects, because they can lead
-us to make existing code more general, more flexible, more multipurpose and
-perhaps more robust. Just don't hesitate to abandon the idea if it looks like
-it's going to make a mess of things.
-
-Complicated features can often be done as a series of refactorings, with the
-final change that actually implements the feature as a quite small patch at the
-end. It's wonderful when this happens, especially when those refactorings are
-things that improve the codebase in their own right. When that happens there's
-much less risk of wasted effort if the feature you were going for doesn't work
-out.
-
-Always strive to work incrementally. Always strive to turn the big projects
-into little bite sized projects that can prove their own merits.
-
-Instead of always tackling those big projects, look for little things that
-will be useful, and make the big projects easier.
-
-The question of what's likely to be useful is where junior developers most
-often go astray - doing something because it seems like it'll be useful often
-leads to overengineering. Knowing what's useful comes from many years of
-experience, or talking with people who have that experience - or from simply
-reading lots of code and looking for common patterns and issues. Don't be
-afraid to throw things away and do something simpler.
-
-Talk about your ideas with your fellow developers; oftentimes the best things
-come from relaxed conversations where people aren't afraid to say "what if?".
-
-Don't neglect your tools.
-
-The most important tools (besides the compiler and our text editor) are the
-tools we use for testing. The shortest possible edit/test/debug cycle is
-essential for working productively. We learn, gain experience, and discover the
-errors in our thinking by running our code and seeing what happens. If your
-time is being wasted because your tools are bad or too slow - don't accept it,
-fix it.
-
-Put effort into your documentation, commit messages, and code comments - but
-don't go overboard. A good commit message is wonderful - but if the information
-was important enough to go in a commit message, ask yourself if it would be
-even better as a code comment.
-
-A good code comment is wonderful, but even better is the comment that didn't
-need to exist because the code was so straightforward as to be obvious;
-organized into small clean and tidy modules, with clear and descriptive names
-for functions and variables, where every line of code has a clear purpose.
diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
deleted file mode 100644 (file)
index 18c79d5..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-Submitting patches to bcachefs
-==============================
-
-Here are suggestions for submitting patches to the bcachefs subsystem.
-
-Submission checklist
---------------------
-
-Patches must be tested before being submitted, either with the xfstests suite
-[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being
-touched. Note that ktest wraps xfstests and will be an easier way to run
-it for most users; it includes single-command wrappers for all the mainstream
-in-kernel local filesystems.
-
-Patches will undergo more testing after being merged (including
-lockdep/kasan/preempt/etc. variants); these are not generally required to be
-run by the submitter - but do put some thought into what you're changing and
-which tests might be relevant. E.g., tricky memory layout work calls for
-kasan, and locking work calls for lockdep; ktest includes single-command
-variants for the debug build types you'll most likely need.
-
-The exception to this rule is incomplete WIP/RFC patches: if you're working on
-something nontrivial, it's encouraged to send out a WIP patch to let people
-know what you're doing and make sure you're on the right track. Just make sure
-it includes a brief note as to what's done and what's incomplete, to avoid
-confusion.
-
-Rigorous checkpatch.pl adherence is not required (many of its warnings are
-considered out of date), but try not to deviate too much without reason.
-
-Focus on writing code that reads well and is organized well; code should be
-aesthetically pleasing.
-
-CI
---
-
-Instead of running your tests locally, when running the full test suite it's
-preferable to let a server farm do it in parallel, and then have the results
-in a nice test dashboard (which can tell you which failures are new, and
-presents results in a git log view, avoiding the need for most bisecting).
-
-That exists [2]_, and community members may request an account. If you work for
-a big tech company, you'll need to help out with server costs to get access -
-but the CI is not restricted to running bcachefs tests: it runs any ktest test
-(which generally makes it easy to wrap other tests that can run in qemu).
-
-Other things to think about
----------------------------
-
-- How will we debug this code? Is there sufficient introspection to diagnose
-  when something starts acting wonky on a user machine?
-
-  We don't necessarily need every single field of every data structure visible
-  with introspection, but having the important fields of all the core data
-  types wired up makes debugging drastically easier - a bit of thoughtful
-  foresight greatly reduces the need to have people build custom kernels with
-  debug patches.
-
-  More broadly, think about all the debug tooling that might be needed.
-
-- Does it make the codebase more or less of a mess? Can we also try to do some
-  organizing, too?
-
-- Do new tests need to be written? New assertions? How do we know and verify
-  that the code is correct, and what happens if something goes wrong?
-
-  We don't yet have automated code coverage analysis or easy fault injection -
-  but for now, pretend we did and ask what they might tell us.
-
-  Assertions are hugely important, given that we don't yet have a systems
-  language that can do ergonomic embedded correctness proofs. Hitting an assert
-  in testing is much better than wandering off into undefined behaviour la-la
-  land - use them. Use them judiciously, and not as a replacement for proper
-  error handling, but use them.
-
-- Does it need to be performance tested? Should we add new performance counters?
-
-  bcachefs has a set of persistent runtime counters which can be viewed with
-  the 'bcachefs fs top' command; this should give users a basic idea of what
-  their filesystem is currently doing. If you're doing a new feature or looking
-  at old code, think if anything should be added.
-
-- If it's a new on disk format feature - have upgrades and downgrades been
-  tested? (Automated tests exist but aren't in the CI, due to the hassle of
-  disk image management; coordinate to have them run.)
-
-Mailing list, IRC
------------------
-
-Patches should hit the list [3]_, but much discussion and code review happens
-on IRC as well [4]_; many people appreciate the more conversational approach
-and quicker feedback.
-
-Additionally, we have a lively user community doing excellent QA work, which
-exists primarily on IRC. Please make use of that resource; user feedback is
-important for any nontrivial feature, and documenting it in commit messages
-would be a good idea.
-
-.. rubric:: References
-
-.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
-.. [1] https://evilpiepirate.org/git/ktest.git/
-.. [2] https://evilpiepirate.org/~testdashboard/ci/
-.. [3] linux-bcachefs@vger.kernel.org
-.. [4] irc.oftc.net#bcache, #bcachefs-dev
diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst
deleted file mode 100644 (file)
index 871a38f..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Casefolding
-===========
-
-bcachefs has support for case-insensitive file and directory
-lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`)
-casefolding attributes.
-
-The main usecase for casefolding is compatibility with software written
-against other filesystems that rely on casefolded lookups
-(eg. NTFS and Wine/Proton).
-Taking advantage of file-system level casefolding can lead to great
-loading time gains in many applications and games.
-
-Casefolding support requires a kernel with `CONFIG_UNICODE` enabled.
-Once a directory has been flagged for casefolding, a feature bit
-is enabled on the superblock which marks the filesystem as using
-casefolding.
-When the feature bit for casefolding is enabled, it is no longer possible
-to mount that filesystem on kernels without `CONFIG_UNICODE` enabled.
-
-On the lookup/query side: casefolding is implemented by allocating a new
-string of `BCH_NAME_MAX` length using the `utf8_casefold` function to
-casefold the query string.
-
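A rough sketch of that query side, assuming the kernel's fs/unicode helper and
the `BCH_NAME_MAX` bound mentioned above (buffer handling simplified; the real
bcachefs call sites differ):

    #include <linux/unicode.h>

    #define BCH_NAME_MAX 512        /* assumed value, for the sketch only */

    static int lookup_casefolded(const struct unicode_map *um,
                                 const struct qstr *name)
    {
            unsigned char buf[BCH_NAME_MAX];
            int len = utf8_casefold(um, name, buf, sizeof(buf));

            if (len < 0)            /* name is not valid UTF-8 */
                    return len;

            /* ... hash buf[0..len) and compare as a regular lookup would ... */
            return 0;
    }
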
-On the dirent side: casefolding is implemented by ensuring the `bkey`'s
-hash is made from the casefolded string and storing the cached casefolded
-name with the regular name in the dirent.
-
-The structure looks like this:
-
-* Regular:    [dirent data][regular name][nul][nul]...
-* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]...
-
-(Do note, the number of NULs here is merely for illustration; their count can
-vary per-key, and they may not even be present if the key is aligned to
-`sizeof(u64)`.)
-
-This is efficient as it means that for all file lookups that require casefolding,
-it has identical performance to a regular lookup:
-a hash comparison and a `memcmp` of the name.
-
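Modelled as a C struct, the casefolded variant might look like this; the field
widths are an assumption based on the "first 4 bytes of the name block as
lengths" note in the rationale below, not the actual `bch_dirent` definition:

    #include <linux/types.h>

    /* Sketch: two lengths, then both names back to back. */
    struct cf_name_block {
            __le16  name_len;       /* regular name length */
            __le16  cf_name_len;    /* casefolded name length */
            __u8    names[];        /* [regular name][casefolded name][nul pad] */
    };
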
-Rationale
----------
-
-Several designs were considered for this system:
-One was to introduce a dirent_v2; however, that would be painful, especially as
-the hash system only has support for a single key type. This would also need
-`BCH_NAME_MAX` to change between versions, and a new feature bit.
-
-Another option was to store the names without the two lengths, and take half
-the combined length of the contiguous regular and casefolded names as each
-name's length. This would assume that the regular length == casefolded length,
-but that could potentially not be true: an uppercase unicode glyph can have a
-different UTF-8 encoding than the lowercase glyph.
-It would be possible to disregard the casefold cache for those cases, but it was
-decided to simply encode the two string lengths in the key to avoid random
-performance issues if this edge case was ever hit.
-
-The option settled on was to use a free bit in d_type to mark a dirent as having
-a casefold cache, and then treat the first 4 bytes of the name block as lengths.
-You can see this in the `d_cf_name_block` member of union in `bch_dirent`.
-
-The feature bit approach was chosen so that casefolding support could be enabled
-for the majority of users, while still allowing users who have no need for the
-feature to use bcachefs without it: `CONFIG_UNICODE` can increase kernel size by
-a significant amount due to the tables used, which may be the deciding factor
-for using bcachefs on eg. embedded platforms.
-
-Other filesystems like ext4 and f2fs have a super-block level option for casefolding
-encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose
-any encodings other than a single UTF-8 version. When future encodings are desirable,
-they will be added trivially using the opts mechanism.
-
-dentry/dcache considerations
-----------------------------
-
-Currently, in casefolded directories, bcachefs (like other filesystems) will not cache
-negative dentries.
-
-This is because currently doing so presents a problem in the following scenario:
-
- - Lookup file "blAH" in a casefolded directory
- - Creation of file "BLAH" in a casefolded directory
- - Lookup file "blAH" in a casefolded directory
-
-This would fail if negative dentries were cached.
-
-This is slightly suboptimal, but could be fixed in future with some vfs work.
-
-
-References
-----------
-
-(from Peter Anvin, on the list)
-
-It is worth noting that Microsoft has basically declared their
-"recommended" case folding (upcase) table to be permanently frozen (for
-new filesystem instances in the case where they use an on-disk
-translation table created at format time.)  As far as I know they have
-never supported anything other than 1:1 conversion of BMP code points,
-nor normalization.
-
-The exFAT specification enumerates the full recommended upcase table,
-although in a somewhat annoying format (basically a hex dump of
-compressed data):
-
-https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification
diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst
deleted file mode 100644 (file)
index 2cccaa0..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-bcachefs private error codes
-----------------------------
-
-In bcachefs, as a hard rule we do not throw or directly use standard error
-codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed
-in fs/bcachefs/errcode.h.
-
-This gives us much better error messages and makes debugging much easier. Any
-direct uses of standard error codes you see in the source code are simply old
-code that has yet to be converted - feel free to clean it up!
-
-Private error codes may subtype another error code; this allows for grouping of
-related errors that should be handled similarly (e.g. transaction restart
-errors), as well as specifying which standard error code should be returned at
-the bcachefs module boundary.
-
-At the module boundary, we use bch2_err_class() to convert to a standard error
-code; this also emits a trace event so that the original error code can be
-recovered even if it wasn't logged.
-
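A hypothetical sketch of that arrangement (the real codes and bch2_err_class()
live in fs/bcachefs/errcode.h and errcode.c, both deleted below): private codes
sit above the errno range, and each collapses to its standard class at the
module boundary.

    #include <linux/errno.h>

    enum {
            PRIV_ERR_START = 2048,          /* above all standard errnos */
            PRIV_ERR_btree_node_read,       /* class: EIO */
            PRIV_ERR_xattr_name_too_long,   /* class: ENAMETOOLONG */
    };

    static int err_class(int err)
    {
            switch (-err) {
            case PRIV_ERR_btree_node_read:          return -EIO;
            case PRIV_ERR_xattr_name_too_long:      return -ENAMETOOLONG;
            default:                                return err;
            }
    }
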
-Do not reuse error codes! Generally speaking, a private error code should only
-be thrown in one place. That means that when we see it in a log message we can
-see, unambiguously, exactly which file and line number it was returned from.
-
-Try to give error codes names that are as reasonably descriptive of the error
-as possible. Frequently, the error will be logged at a place far removed from
-where the error was generated; good names for error codes mean much more
-descriptive and useful error messages.
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst
deleted file mode 100644 (file)
index 59a3325..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-Idle/background work classes design doc:
-
-Right now, our behaviour at idle isn't ideal: it was designed for servers that
-would be under sustained load, to keep pending work at a "medium" level, to
-let work build up so we can process it in more efficient batches, while also
-giving headroom for bursts in load.
-
-But for desktops or mobile - scenarios where work is less sustained and power
-usage is more important - we want to operate differently, with a "rush to
-idle" so the system can go to sleep. We don't want to be dribbling out
-background work while the system should be idle.
-
-The complicating factor is that there are a number of background tasks, which
-form a hierarchy (or a digraph, depending on how you divide it up) - one
-background task may generate work for another.
-
-Thus proper idle detection needs to model this hierarchy.
-
-- Foreground writes
-- Page cache writeback
-- Copygc, rebalance
-- Journal reclaim
-
-When we implement idle detection and rush to idle, we need to be careful not
-to disturb too much the existing behaviour that works reasonably well when the
-system is under sustained load (or perhaps improve it in the case of
-rebalance, which currently does not actively attempt to let work batch up).
-
-SUSTAINED LOAD REGIME
----------------------
-
-When the system is under continuous load, we want these jobs to run
-continuously - this is perhaps best modelled with a P/D controller, where
-they'll be trying to keep a target value (i.e. fragmented disk space,
-available journal space) roughly in the middle of some range.
-
-The goal under sustained load is to balance our ability to handle load spikes
-without running out of x resource (free disk space, free space in the
-journal), while also letting some work accumulate to be batched (or become
-unnecessary).
-
-For example, we don't want to run copygc too aggressively, because then it
-will be evacuating buckets that would have become empty (been overwritten or
-deleted) anyways, and we don't want to wait until we're almost out of free
-space because then the system will behave unpredictably - suddenly we're doing
-a lot more work to service each write and the system becomes much slower.
-
-IDLE REGIME
------------
-
-When the system becomes idle, we should start flushing our pending work
-quicker so the system can go to sleep.
-
-Note that the definition of "idle" depends on where in the hierarchy a task
-is - a task should start flushing work more quickly when the task above it has
-stopped generating new work.
-
-e.g. rebalance should start flushing more quickly when page cache writeback is
-idle, and journal reclaim should only start flushing more quickly when both
-copygc and rebalance are idle.
-
-It's important to let work accumulate when more work is still incoming and we
-still have room, because flushing is always more efficient if we let it batch
-up. New writes may overwrite data before rebalance moves it, and tasks may be
-generating more updates for the btree nodes that journal reclaim needs to flush.
-
-On idle, how much work we do at each interval should be proportional to the
-length of time we have been idle for. If we're idle only for a short duration,
-we shouldn't flush everything right away; the system might wake up and start
-generating new work soon, and flushing immediately might end up doing a lot of
-work that would have been unnecessary if we'd allowed things to batch more.
-To summarize, we will need:
-
- - A list of classes for background tasks that generate work, which will
-   include one "foreground" class.
- - Tracking for each class - "Am I doing work, or have I gone to sleep?"
- - And each class should check the class above it when deciding how much work to issue.
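As a sketch of the "proportional to idle time" rule above (hypothetical units
and interface, not bcachefs code):

    #include <linux/minmax.h>
    #include <linux/time64.h>
    #include <linux/types.h>

    /* Ramp the per-interval flush quota up with idle time, so short naps
     * flush little and a long idle eventually flushes everything.
     */
    static u64 work_to_issue(u64 idle_ns, u64 pending, u64 max_per_interval)
    {
            u64 ramp = min_t(u64, idle_ns / NSEC_PER_SEC, 16);

            return min_t(u64, pending, ramp * max_per_interval / 16);
    }
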
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
deleted file mode 100644 (file)
index e5c4c21..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================
-bcachefs Documentation
-======================
-
-Subsystem-specific development process notes
---------------------------------------------
-
-Development notes specific to bcachefs. These are intended to supplement the
-:doc:`general kernel development handbook </process/index>`.
-
-.. toctree::
-   :maxdepth: 1
-   :numbered:
-
-   CodingStyle
-   SubmittingPatches
-
-Filesystem implementation
--------------------------
-
-Documentation for filesystem features and their implementation details.
-At this moment, only a few of these are described here.
-
-.. toctree::
-   :maxdepth: 1
-   :numbered:
-
-   casefolding
-   errorcodes
-
-Future design
--------------
-.. toctree::
-   :maxdepth: 1
-
-   future/idle_work
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 11a599387266a42f8ac6496e97485c437f330333..622187a96bdc6cb362955871d2a0546e99742592 100644 (file)
@@ -72,7 +72,6 @@ Documentation for filesystem implementations.
    afs
    autofs
    autofs-mount-control
-   bcachefs/index
    befs
    bfs
    btrfs
diff --git a/MAINTAINERS b/MAINTAINERS
index 8eaecbb2cc2bafc34810e87534a8349b95036632..4fd5b2ec4f85c65f55fc618ea4268ba9190b7404 100644 (file)
@@ -4217,10 +4217,7 @@ M:       Kent Overstreet <kent.overstreet@linux.dev>
 L:     linux-bcachefs@vger.kernel.org
 S:     Externally maintained
 C:     irc://irc.oftc.net/bcache
-P:      Documentation/filesystems/bcachefs/SubmittingPatches.rst
 T:     git https://evilpiepirate.org/git/bcachefs.git
-F:     fs/bcachefs/
-F:     Documentation/filesystems/bcachefs/
 
 BDISP ST MEDIA DRIVER
 M:     Fabien Dessenne <fabien.dessenne@foss.st.com>
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 5171bb183967b91f740f44236c14cfbddb4edbf0..b5546a3ac9c5bd0122371854836fa48660e2666b 100644 (file)
@@ -454,7 +454,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 16f343ae48c67592b53daee968c96cc94bef25cc..4ea0d686e28eaa8e665bf9604b233e207f2a8c7a 100644 (file)
@@ -411,7 +411,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index c08788728ea9622ef8fa91e6e77ea456e89ed8f3..0698d9d4b04e8c1880d5551dd8dd91195e1ef598 100644 (file)
@@ -431,7 +431,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 962497e7c53fd62066b2ed8ad04f4fac01209e8d..45d1ee0860e531675a6bb831acb50fc31b8bde07 100644 (file)
@@ -403,7 +403,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index ec28650189e406dbd1dcb56a91d8a007ae90cc22..e5794b906b655fba2b9677a5c28267958598a4f9 100644 (file)
@@ -413,7 +413,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 0afb3ad180dee355dae8639e819eb98bf7d59493..fb84ba4c13500698d773a6fa9f1011a754226ac3 100644 (file)
@@ -430,7 +430,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index b311e953995d6da2ee5d270582efdaf29ae60216..9a05e05523fc65d1fe29d5422de40da4654006b4 100644 (file)
@@ -517,7 +517,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index f4e6224f137f99f6343e68d8b26a8c18c2b27626..0e30aa574a2b492b9357fc829ecb7887c01086a8 100644 (file)
@@ -403,7 +403,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 498e167222f18c7eda7e84fd911232d75d1683d0..d6f5600d941065eadc192d359ac22bad48aacc0e 100644 (file)
@@ -404,7 +404,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 8c6b1eef8534237af0d947b1fdba7c6be553c2dd..16f0adff4ada7fca480878e5ddb772f95c67e50b 100644 (file)
@@ -420,7 +420,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index c34648f299efb99a72b566b84cac30ac4639f44e..5b0e273be74bdb13a8eae023a6fd0b44cfa1b646 100644 (file)
@@ -401,7 +401,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 73810d14660f2177d6f49e38ca9e95421c44f2b9..3851d6720fac45eed0d4d00b4a4f9e2a47219db4 100644 (file)
@@ -401,7 +401,6 @@ CONFIG_XFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
 CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
 CONFIG_FANOTIFY=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 5e616bc988ac355b94d87c1153c09d3f9bfffc89..8ceef305f13875962f4246389c67274dcd207e30 100644 (file)
@@ -658,9 +658,6 @@ CONFIG_BTRFS_FS_POSIX_ACL=y
 CONFIG_BTRFS_DEBUG=y
 CONFIG_BTRFS_ASSERT=y
 CONFIG_NILFS2_FS=m
-CONFIG_BCACHEFS_FS=y
-CONFIG_BCACHEFS_QUOTA=y
-CONFIG_BCACHEFS_POSIX_ACL=y
 CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
 CONFIG_FS_ENCRYPTION=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 094599cdaf4d9b8473dad2cdcf6d3fca165cca90..3fc12b0af55b25e4d1c94dd2c9ad88ac69dc1d91 100644 (file)
@@ -645,9 +645,6 @@ CONFIG_OCFS2_FS=m
 CONFIG_BTRFS_FS=y
 CONFIG_BTRFS_FS_POSIX_ACL=y
 CONFIG_NILFS2_FS=m
-CONFIG_BCACHEFS_FS=m
-CONFIG_BCACHEFS_QUOTA=y
-CONFIG_BCACHEFS_POSIX_ACL=y
 CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
 CONFIG_FS_ENCRYPTION=y
diff --git a/fs/Kconfig b/fs/Kconfig
index c654a3642897001d2afbb110ec249a3dda63f0fd..7815379032dacb3997d76db45851aad05f0c13ff 100644 (file)
@@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 
 endif # BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index 334654f9584b9449ee5ec4dfe4a794261b6f0b9d..e3523ab2e587131b1591414fc8a6c43c990e4430 100644 (file)
@@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS)              += ocfs2/
 obj-$(CONFIG_BTRFS_FS)         += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)          += f2fs/
-obj-$(CONFIG_BCACHEFS_FS)      += bcachefs/
 obj-$(CONFIG_CEPH_FS)          += ceph/
 obj-$(CONFIG_PSTORE)           += pstore/
 obj-$(CONFIG_EFIVAR_FS)                += efivarfs/
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
deleted file mode 100644 (file)
index 8cb2b9d..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-
-config BCACHEFS_FS
-       tristate "bcachefs filesystem support (EXPERIMENTAL)"
-       depends on BLOCK
-       select EXPORTFS
-       select CLOSURES
-       select CRC32
-       select CRC64
-       select FS_POSIX_ACL
-       select LZ4_COMPRESS
-       select LZ4_DECOMPRESS
-       select LZ4HC_COMPRESS
-       select LZ4HC_DECOMPRESS
-       select ZLIB_DEFLATE
-       select ZLIB_INFLATE
-       select ZSTD_COMPRESS
-       select ZSTD_DECOMPRESS
-       select CRYPTO_LIB_SHA256
-       select CRYPTO_LIB_CHACHA
-       select CRYPTO_LIB_POLY1305
-       select KEYS
-       select RAID6_PQ
-       select XOR_BLOCKS
-       select XXHASH
-       select SRCU
-       select SYMBOLIC_ERRNAME
-       select MIN_HEAP
-       select XARRAY_MULTI
-       help
-       The bcachefs filesystem - a modern, copy on write filesystem, with
-       support for multiple devices, compression, checksumming, etc.
-
-config BCACHEFS_QUOTA
-       bool "bcachefs quota support"
-       depends on BCACHEFS_FS
-       select QUOTACTL
-
-config BCACHEFS_ERASURE_CODING
-       bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
-       depends on BCACHEFS_FS
-       select QUOTACTL
-       help
-       This enables the "erasure_code" filesystem and inode option, which
-       organizes data into reed-solomon stripes instead of ordinary
-       replication.
-
-       WARNING: this feature is still undergoing on disk format changes, and
-       should only be enabled for testing purposes.
-
-config BCACHEFS_POSIX_ACL
-       bool "bcachefs POSIX ACL support"
-       depends on BCACHEFS_FS
-       select FS_POSIX_ACL
-
-config BCACHEFS_DEBUG
-       bool "bcachefs debugging"
-       depends on BCACHEFS_FS
-       help
-       Enables many extra debugging checks and assertions.
-
-       The resulting code will be significantly slower than normal; you
-       probably shouldn't select this option unless you're a developer.
-
-config BCACHEFS_INJECT_TRANSACTION_RESTARTS
-       bool "Randomly inject transaction restarts"
-       depends on BCACHEFS_DEBUG
-       help
-       Randomly inject transaction restarts in a few core paths - may have a
-       significant performance penalty
-
-config BCACHEFS_TESTS
-       bool "bcachefs unit and performance tests"
-       depends on BCACHEFS_FS
-       help
-       Include some unit and performance tests for the core btree code
-
-config BCACHEFS_LOCK_TIME_STATS
-       bool "bcachefs lock time statistics"
-       depends on BCACHEFS_FS
-       help
-       Expose statistics for how long we held a lock in debugfs
-
-config BCACHEFS_NO_LATENCY_ACCT
-       bool "disable latency accounting and time stats"
-       depends on BCACHEFS_FS
-       help
-       This disables device latency tracking and time stats, only for performance testing
-
-config BCACHEFS_SIX_OPTIMISTIC_SPIN
-       bool "Optimistic spinning for six locks"
-       depends on BCACHEFS_FS
-       depends on SMP
-       default y
-       help
-       Instead of immediately sleeping when attempting to take a six lock that
-       is held by another thread, spin for a short while, as long as the
-       thread owning the lock is running.
-
-config BCACHEFS_PATH_TRACEPOINTS
-       bool "Extra btree_path tracepoints"
-       depends on BCACHEFS_FS && TRACING
-       help
-       Enable extra tracepoints for debugging btree_path operations; we don't
-       normally want these enabled because they happen at very high rates.
-
-config BCACHEFS_TRANS_KMALLOC_TRACE
-       bool "Trace bch2_trans_kmalloc() calls"
-       depends on BCACHEFS_FS
-
-config BCACHEFS_ASYNC_OBJECT_LISTS
-       bool "Keep async objects on fast_lists for debugfs visibility"
-       depends on BCACHEFS_FS && DEBUG_FS
-
-config MEAN_AND_VARIANCE_UNIT_TEST
-       tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
-       depends on KUNIT
-       depends on BCACHEFS_FS
-       default KUNIT_ALL_TESTS
-       help
-         This option enables the kunit tests for the mean_and_variance module.
-         If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
deleted file mode 100644 (file)
index 93c8ee5..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-
-obj-$(CONFIG_BCACHEFS_FS)      += bcachefs.o
-
-bcachefs-y             :=      \
-       acl.o                   \
-       alloc_background.o      \
-       alloc_foreground.o      \
-       backpointers.o          \
-       bkey.o                  \
-       bkey_methods.o          \
-       bkey_sort.o             \
-       bset.o                  \
-       btree_cache.o           \
-       btree_gc.o              \
-       btree_io.o              \
-       btree_iter.o            \
-       btree_journal_iter.o    \
-       btree_key_cache.o       \
-       btree_locking.o         \
-       btree_node_scan.o       \
-       btree_trans_commit.o    \
-       btree_update.o          \
-       btree_update_interior.o \
-       btree_write_buffer.o    \
-       buckets.o               \
-       buckets_waiting_for_journal.o   \
-       chardev.o               \
-       checksum.o              \
-       clock.o                 \
-       compress.o              \
-       darray.o                \
-       data_update.o           \
-       debug.o                 \
-       dirent.o                \
-       disk_accounting.o       \
-       disk_groups.o           \
-       ec.o                    \
-       enumerated_ref.o        \
-       errcode.o               \
-       error.o                 \
-       extents.o               \
-       extent_update.o         \
-       eytzinger.o             \
-       fast_list.o             \
-       fs.o                    \
-       fs-ioctl.o              \
-       fs-io.o                 \
-       fs-io-buffered.o        \
-       fs-io-direct.o          \
-       fs-io-pagecache.o       \
-       fsck.o                  \
-       inode.o                 \
-       io_read.o               \
-       io_misc.o               \
-       io_write.o              \
-       journal.o               \
-       journal_io.o            \
-       journal_reclaim.o       \
-       journal_sb.o            \
-       journal_seq_blacklist.o \
-       keylist.o               \
-       logged_ops.o            \
-       lru.o                   \
-       mean_and_variance.o     \
-       migrate.o               \
-       move.o                  \
-       movinggc.o              \
-       namei.o                 \
-       nocow_locking.o         \
-       opts.o                  \
-       printbuf.o              \
-       progress.o              \
-       quota.o                 \
-       rebalance.o             \
-       rcu_pending.o           \
-       recovery.o              \
-       recovery_passes.o       \
-       reflink.o               \
-       replicas.o              \
-       sb-clean.o              \
-       sb-counters.o           \
-       sb-downgrade.o          \
-       sb-errors.o             \
-       sb-members.o            \
-       siphash.o               \
-       six.o                   \
-       snapshot.o              \
-       str_hash.o              \
-       subvolume.o             \
-       super.o                 \
-       super-io.o              \
-       sysfs.o                 \
-       tests.o                 \
-       time_stats.o            \
-       thread_with_file.o      \
-       trace.o                 \
-       two_state_shared_lock.o \
-       util.o                  \
-       varint.o                \
-       xattr.o
-
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS)   += async_objs.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
-
-# Silence "note: xyz changed in GCC X.X" messages
-subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
deleted file mode 100644 (file)
index d03adc3..0000000
+++ /dev/null
@@ -1,445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-
-#include "acl.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static const char * const acl_types[] = {
-       [ACL_USER_OBJ]  = "user_obj",
-       [ACL_USER]      = "user",
-       [ACL_GROUP_OBJ] = "group_obj",
-       [ACL_GROUP]     = "group",
-       [ACL_MASK]      = "mask",
-       [ACL_OTHER]     = "other",
-       NULL,
-};
-
-void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
-{
-       const void *p, *end = value + size;
-
-       if (!value ||
-           size < sizeof(bch_acl_header) ||
-           ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
-               return;
-
-       p = value + sizeof(bch_acl_header);
-       while (p < end) {
-               const bch_acl_entry *in = p;
-               unsigned tag = le16_to_cpu(in->e_tag);
-
-               prt_str(out, acl_types[tag]);
-
-               switch (tag) {
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       p += sizeof(bch_acl_entry_short);
-                       break;
-               case ACL_USER:
-                       prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
-                       p += sizeof(bch_acl_entry);
-                       break;
-               case ACL_GROUP:
-                       prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
-                       p += sizeof(bch_acl_entry);
-                       break;
-               }
-
-               prt_printf(out, " %o", le16_to_cpu(in->e_perm));
-
-               if (p != end)
-                       prt_char(out, ' ');
-       }
-}
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-#include "fs.h"
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
-{
-       return sizeof(bch_acl_header) +
-               sizeof(bch_acl_entry_short) * nr_short +
-               sizeof(bch_acl_entry) * nr_long;
-}
-
-static inline int acl_to_xattr_type(int type)
-{
-       switch (type) {
-       case ACL_TYPE_ACCESS:
-               return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
-       case ACL_TYPE_DEFAULT:
-               return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
-       default:
-               BUG();
-       }
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
-                                           const void *value, size_t size)
-{
-       const void *p, *end = value + size;
-       struct posix_acl *acl;
-       struct posix_acl_entry *out;
-       unsigned count = 0;
-       int ret;
-
-       if (!value)
-               return NULL;
-       if (size < sizeof(bch_acl_header))
-               goto invalid;
-       if (((bch_acl_header *)value)->a_version !=
-           cpu_to_le32(BCH_ACL_VERSION))
-               goto invalid;
-
-       p = value + sizeof(bch_acl_header);
-       while (p < end) {
-               const bch_acl_entry *entry = p;
-
-               if (p + sizeof(bch_acl_entry_short) > end)
-                       goto invalid;
-
-               switch (le16_to_cpu(entry->e_tag)) {
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       p += sizeof(bch_acl_entry_short);
-                       break;
-               case ACL_USER:
-               case ACL_GROUP:
-                       p += sizeof(bch_acl_entry);
-                       break;
-               default:
-                       goto invalid;
-               }
-
-               count++;
-       }
-
-       if (p > end)
-               goto invalid;
-
-       if (!count)
-               return NULL;
-
-       acl = allocate_dropping_locks(trans, ret,
-                       posix_acl_alloc(count, _gfp));
-       if (!acl)
-               return ERR_PTR(-ENOMEM);
-       if (ret) {
-               kfree(acl);
-               return ERR_PTR(ret);
-       }
-
-       out = acl->a_entries;
-
-       p = value + sizeof(bch_acl_header);
-       while (p < end) {
-               const bch_acl_entry *in = p;
-
-               out->e_tag  = le16_to_cpu(in->e_tag);
-               out->e_perm = le16_to_cpu(in->e_perm);
-
-               switch (out->e_tag) {
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       p += sizeof(bch_acl_entry_short);
-                       break;
-               case ACL_USER:
-                       out->e_uid = make_kuid(&init_user_ns,
-                                              le32_to_cpu(in->e_id));
-                       p += sizeof(bch_acl_entry);
-                       break;
-               case ACL_GROUP:
-                       out->e_gid = make_kgid(&init_user_ns,
-                                              le32_to_cpu(in->e_id));
-                       p += sizeof(bch_acl_entry);
-                       break;
-               }
-
-               out++;
-       }
-
-       BUG_ON(out != acl->a_entries + acl->a_count);
-
-       return acl;
-invalid:
-       pr_err("invalid acl entry");
-       return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static struct bkey_i_xattr *
-bch2_acl_to_xattr(struct btree_trans *trans,
-                 const struct posix_acl *acl,
-                 int type)
-{
-       struct bkey_i_xattr *xattr;
-       bch_acl_header *acl_header;
-       const struct posix_acl_entry *acl_e, *pe;
-       void *outptr;
-       unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
-
-       FOREACH_ACL_ENTRY(acl_e, acl, pe) {
-               switch (acl_e->e_tag) {
-               case ACL_USER:
-               case ACL_GROUP:
-                       nr_long++;
-                       break;
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       nr_short++;
-                       break;
-               default:
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       acl_len = bch2_acl_size(nr_short, nr_long);
-       u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
-
-       if (u64s > U8_MAX)
-               return ERR_PTR(-E2BIG);
-
-       xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
-       if (IS_ERR(xattr))
-               return xattr;
-
-       bkey_xattr_init(&xattr->k_i);
-       xattr->k.u64s           = u64s;
-       xattr->v.x_type         = acl_to_xattr_type(type);
-       xattr->v.x_name_len     = 0;
-       xattr->v.x_val_len      = cpu_to_le16(acl_len);
-
-       acl_header = xattr_val(&xattr->v);
-       acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
-
-       outptr = (void *) acl_header + sizeof(*acl_header);
-
-       FOREACH_ACL_ENTRY(acl_e, acl, pe) {
-               bch_acl_entry *entry = outptr;
-
-               entry->e_tag = cpu_to_le16(acl_e->e_tag);
-               entry->e_perm = cpu_to_le16(acl_e->e_perm);
-               switch (acl_e->e_tag) {
-               case ACL_USER:
-                       entry->e_id = cpu_to_le32(
-                               from_kuid(&init_user_ns, acl_e->e_uid));
-                       outptr += sizeof(bch_acl_entry);
-                       break;
-               case ACL_GROUP:
-                       entry->e_id = cpu_to_le32(
-                               from_kgid(&init_user_ns, acl_e->e_gid));
-                       outptr += sizeof(bch_acl_entry);
-                       break;
-
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       outptr += sizeof(bch_acl_entry_short);
-                       break;
-               }
-       }
-
-       BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
-
-       return xattr;
-}
-
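-/*
- * Look up an inode's ACL, stored as an xattr with a zero-length name:
- * retries on transaction restart, and populates the VFS ACL cache on success:
- */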
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
-{
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-       struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-       struct btree_iter iter = {};
-       struct posix_acl *acl = NULL;
-
-       if (rcu)
-               return ERR_PTR(-ECHILD);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-retry:
-       bch2_trans_begin(trans);
-
-       struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
-                                            &hash, inode_inum(inode), &search, 0);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-       acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
-                                le16_to_cpu(xattr.v->x_val_len));
-       ret = PTR_ERR_OR_ZERO(acl);
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (ret)
-               acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
-
-       if (!IS_ERR_OR_NULL(acl))
-               set_cached_acl(&inode->v, type, acl);
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return acl;
-}
-
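-/*
- * Set or clear an ACL from within an existing btree transaction: a NULL @acl
- * deletes the xattr, and ENOENT from the delete is not treated as an error:
- */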
-int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
-                      struct bch_inode_unpacked *inode_u,
-                      struct posix_acl *acl, int type)
-{
-       struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
-       int ret;
-
-       if (type == ACL_TYPE_DEFAULT &&
-           !S_ISDIR(inode_u->bi_mode))
-               return acl ? -EACCES : 0;
-
-       if (acl) {
-               struct bkey_i_xattr *xattr =
-                       bch2_acl_to_xattr(trans, acl, type);
-               if (IS_ERR(xattr))
-                       return PTR_ERR(xattr);
-
-               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
-                                   inum, &xattr->k_i, 0);
-       } else {
-               struct xattr_search_key search =
-                       X_SEARCH(acl_to_xattr_type(type), "", 0);
-
-               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
-                                      inum, &search);
-       }
-
-       return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-}
-
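-/*
- * VFS ->set_acl hook: updates the ACL xattr and the inode's ctime/mode in a
- * single transaction, retrying the whole sequence on transaction restart:
- */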
-int bch2_set_acl(struct mnt_idmap *idmap,
-                struct dentry *dentry,
-                struct posix_acl *_acl, int type)
-{
-       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_iter inode_iter = {};
-       struct bch_inode_unpacked inode_u;
-       struct posix_acl *acl;
-       umode_t mode;
-       int ret;
-
-       mutex_lock(&inode->ei_update_lock);
-       struct btree_trans *trans = bch2_trans_get(c);
-retry:
-       bch2_trans_begin(trans);
-       acl = _acl;
-
-       ret   = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
-               bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
-                             BTREE_ITER_intent);
-       if (ret)
-               goto btree_err;
-
-       mode = inode_u.bi_mode;
-
-       if (type == ACL_TYPE_ACCESS) {
-               ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
-               if (ret)
-                       goto btree_err;
-       }
-
-       ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
-       if (ret)
-               goto btree_err;
-
-       inode_u.bi_ctime        = bch2_current_time(c);
-       inode_u.bi_mode         = mode;
-
-       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0);
-btree_err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-       if (unlikely(ret))
-               goto err;
-
-       bch2_inode_update_after_write(trans, inode, &inode_u,
-                                     ATTR_CTIME|ATTR_MODE);
-
-       set_cached_acl(&inode->v, type, acl);
-err:
-       bch2_trans_put(trans);
-       mutex_unlock(&inode->ei_update_lock);
-
-       return ret;
-}
-
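-/*
- * Recompute the access ACL for a mode change, updating the xattr in place
- * and returning the new ACL via @new_acl:
- */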
-int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
-                  struct bch_inode_unpacked *inode,
-                  umode_t mode,
-                  struct posix_acl **new_acl)
-{
-       struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
-       struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
-       struct btree_iter iter;
-       struct posix_acl *acl = NULL;
-
-       struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
-                              &hash_info, inum, &search, BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (ret)
-               return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
-       struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
-       acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
-                       le16_to_cpu(xattr.v->x_val_len));
-       ret = PTR_ERR_OR_ZERO(acl);
-       if (ret)
-               goto err;
-
-       ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
-       if (ret)
-               goto err;
-
-       struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
-       ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               goto err;
-
-       new->k.p = iter.pos;
-       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-       *new_acl = acl;
-       acl = NULL;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       if (!IS_ERR_OR_NULL(acl))
-               kfree(acl);
-       return ret;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
deleted file mode 100644 (file)
index fe730a6..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ACL_H
-#define _BCACHEFS_ACL_H
-
-struct bch_inode_unpacked;
-struct bch_hash_info;
-struct bch_inode_info;
-struct posix_acl;
-
-#define BCH_ACL_VERSION        0x0001
-
-typedef struct {
-       __le16          e_tag;
-       __le16          e_perm;
-       __le32          e_id;
-} bch_acl_entry;
-
-typedef struct {
-       __le16          e_tag;
-       __le16          e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
-       __le32          a_version;
-} bch_acl_header;
-
-void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-
-int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
-                      struct bch_inode_unpacked *,
-                      struct posix_acl *, int);
-int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, subvol_inum,
-                  struct bch_inode_unpacked *,
-                  umode_t, struct posix_acl **);
-
-#else
-
-static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
-                                    struct bch_inode_unpacked *inode_u,
-                                    struct posix_acl *acl, int type)
-{
-       return 0;
-}
-
-static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
-                                struct bch_inode_unpacked *inode,
-                                umode_t mode,
-                                struct posix_acl **new_acl)
-{
-       return 0;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
-
-#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
deleted file mode 100644 (file)
index 66de463..0000000
+++ /dev/null
@@ -1,2680 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-#include "varint.h"
-
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-#include <linux/jiffies.h>
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
-
-/* Persistent alloc info: */
-
-static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
-       BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bkey_alloc_unpacked {
-       u64             journal_seq;
-       u8              gen;
-       u8              oldest_gen;
-       u8              data_type;
-       bool            need_discard:1;
-       bool            need_inc_gen:1;
-#define x(_name, _bits)        u##_bits _name;
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-};
-
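-/*
- * v1 alloc fields are present only if the corresponding bit in a->fields is
- * set, stored little-endian at the per-field width given in the table above:
- */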
-static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
-                                    const void **p, unsigned field)
-{
-       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
-       u64 v;
-
-       if (!(a->fields & (1 << field)))
-               return 0;
-
-       switch (bytes) {
-       case 1:
-               v = *((const u8 *) *p);
-               break;
-       case 2:
-               v = le16_to_cpup(*p);
-               break;
-       case 4:
-               v = le32_to_cpup(*p);
-               break;
-       case 8:
-               v = le64_to_cpup(*p);
-               break;
-       default:
-               BUG();
-       }
-
-       *p += bytes;
-       return v;
-}
-
-static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
-                                struct bkey_s_c k)
-{
-       const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
-       const void *d = in->data;
-       unsigned idx = 0;
-
-       out->gen = in->gen;
-
-#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
-       BCH_ALLOC_FIELDS_V1()
-#undef  x
-}
-
-static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
-                               struct bkey_s_c k)
-{
-       struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
-       const u8 *in = a.v->data;
-       const u8 *end = bkey_val_end(a);
-       unsigned fieldnr = 0;
-       int ret;
-       u64 v;
-
-       out->gen        = a.v->gen;
-       out->oldest_gen = a.v->oldest_gen;
-       out->data_type  = a.v->data_type;
-
-#define x(_name, _bits)                                                        \
-       if (fieldnr < a.v->nr_fields) {                                 \
-               ret = bch2_varint_decode_fast(in, end, &v);             \
-               if (ret < 0)                                            \
-                       return ret;                                     \
-               in += ret;                                              \
-       } else {                                                        \
-               v = 0;                                                  \
-       }                                                               \
-       out->_name = v;                                                 \
-       if (v != out->_name)                                            \
-               return -1;                                              \
-       fieldnr++;
-
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-       return 0;
-}
-
-static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
-                               struct bkey_s_c k)
-{
-       struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
-       const u8 *in = a.v->data;
-       const u8 *end = bkey_val_end(a);
-       unsigned fieldnr = 0;
-       int ret;
-       u64 v;
-
-       out->gen        = a.v->gen;
-       out->oldest_gen = a.v->oldest_gen;
-       out->data_type  = a.v->data_type;
-       out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
-       out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
-       out->journal_seq = le64_to_cpu(a.v->journal_seq);
-
-#define x(_name, _bits)                                                        \
-       if (fieldnr < a.v->nr_fields) {                                 \
-               ret = bch2_varint_decode_fast(in, end, &v);             \
-               if (ret < 0)                                            \
-                       return ret;                                     \
-               in += ret;                                              \
-       } else {                                                        \
-               v = 0;                                                  \
-       }                                                               \
-       out->_name = v;                                                 \
-       if (v != out->_name)                                            \
-               return -1;                                              \
-       fieldnr++;
-
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-       return 0;
-}
-
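-/*
- * Unpack any of the versioned on-disk alloc formats into the common
- * in-memory representation; unknown key types decode as an empty bucket:
- */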
-static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
-       struct bkey_alloc_unpacked ret = { .gen = 0 };
-
-       switch (k.k->type) {
-       case KEY_TYPE_alloc:
-               bch2_alloc_unpack_v1(&ret, k);
-               break;
-       case KEY_TYPE_alloc_v2:
-               bch2_alloc_unpack_v2(&ret, k);
-               break;
-       case KEY_TYPE_alloc_v3:
-               bch2_alloc_unpack_v3(&ret, k);
-               break;
-       }
-
-       return ret;
-}
-
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
-{
-       unsigned i, bytes = offsetof(struct bch_alloc, data);
-
-       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
-               if (a->fields & (1 << i))
-                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
-
-       return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-       int ret = 0;
-
-       /* allow for unknown fields */
-       bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
-                        c, alloc_v1_val_size_bad,
-                        "incorrect value size (%zu < %u)",
-                        bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-fsck_err:
-       return ret;
-}
-
-int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_alloc_unpacked u;
-       int ret = 0;
-
-       bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
-                        c, alloc_v2_unpack_error,
-                        "unpack error");
-fsck_err:
-       return ret;
-}
-
-int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_alloc_unpacked u;
-       int ret = 0;
-
-       bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-                        c, alloc_v3_unpack_error,
-                        "unpack error");
-fsck_err:
-       return ret;
-}
-
-int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bch_alloc_v4 a;
-       int ret = 0;
-
-       bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
-
-       bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
-                        c, alloc_v4_val_size_bad,
-                        "bad val size (%u > %zu)",
-                        alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
-
-       bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
-                        BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
-                        c, alloc_v4_backpointers_start_bad,
-                        "invalid backpointers_start");
-
-       bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
-                        c, alloc_key_data_type_bad,
-                        "invalid data type (got %u should be %u)",
-                        a.data_type, alloc_data_type(a, a.data_type));
-
-       for (unsigned i = 0; i < 2; i++)
-               bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
-                                c, alloc_key_io_time_bad,
-                                "invalid io_time[%s]: %llu, max %llu",
-                                i == READ ? "read" : "write",
-                                a.io_time[i], LRU_TIME_MAX);
-
-       unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
-               offsetof(struct bch_alloc_v4, stripe_sectors)
-               ? a.stripe_sectors
-               : 0;
-
-       switch (a.data_type) {
-       case BCH_DATA_free:
-       case BCH_DATA_need_gc_gens:
-       case BCH_DATA_need_discard:
-               bkey_fsck_err_on(stripe_sectors ||
-                                a.dirty_sectors ||
-                                a.cached_sectors ||
-                                a.stripe,
-                                c, alloc_key_empty_but_have_data,
-                                "empty data type free but have data %u.%u.%u %u",
-                                stripe_sectors,
-                                a.dirty_sectors,
-                                a.cached_sectors,
-                                a.stripe);
-               break;
-       case BCH_DATA_sb:
-       case BCH_DATA_journal:
-       case BCH_DATA_btree:
-       case BCH_DATA_user:
-       case BCH_DATA_parity:
-               bkey_fsck_err_on(!a.dirty_sectors &&
-                                !stripe_sectors,
-                                c, alloc_key_dirty_sectors_0,
-                                "data_type %s but dirty_sectors==0",
-                                bch2_data_type_str(a.data_type));
-               break;
-       case BCH_DATA_cached:
-               bkey_fsck_err_on(!a.cached_sectors ||
-                                a.dirty_sectors ||
-                                stripe_sectors ||
-                                a.stripe,
-                                c, alloc_key_cached_inconsistency,
-                                "data type inconsistency");
-
-               bkey_fsck_err_on(!a.io_time[READ] &&
-                                !(c->recovery.passes_to_run &
-                                  BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
-                                c, alloc_key_cached_but_read_time_zero,
-                                "cached bucket with read_time == 0");
-               break;
-       case BCH_DATA_stripe:
-               break;
-       }
-fsck_err:
-       return ret;
-}
-
-void bch2_alloc_v4_swab(struct bkey_s k)
-{
-       struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
-
-       a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
-       a->journal_seq_empty    = swab64(a->journal_seq_empty);
-       a->flags                = swab32(a->flags);
-       a->dirty_sectors        = swab32(a->dirty_sectors);
-       a->cached_sectors       = swab32(a->cached_sectors);
-       a->io_time[0]           = swab64(a->io_time[0]);
-       a->io_time[1]           = swab64(a->io_time[1]);
-       a->stripe               = swab32(a->stripe);
-       a->nr_external_backpointers = swab32(a->nr_external_backpointers);
-       a->stripe_sectors       = swab32(a->stripe_sectors);
-}
-
-static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
-                                          unsigned dev, const struct bch_alloc_v4 *a)
-{
-       struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
-
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
-       bch2_prt_data_type(out, a->data_type);
-       prt_newline(out);
-       prt_printf(out, "journal_seq_nonempty %llu\n",  a->journal_seq_nonempty);
-       prt_printf(out, "journal_seq_empty    %llu\n",  a->journal_seq_empty);
-       prt_printf(out, "need_discard         %llu\n",  BCH_ALLOC_V4_NEED_DISCARD(a));
-       prt_printf(out, "need_inc_gen         %llu\n",  BCH_ALLOC_V4_NEED_INC_GEN(a));
-       prt_printf(out, "dirty_sectors        %u\n",    a->dirty_sectors);
-       prt_printf(out, "stripe_sectors       %u\n",    a->stripe_sectors);
-       prt_printf(out, "cached_sectors       %u\n",    a->cached_sectors);
-       prt_printf(out, "stripe               %u\n",    a->stripe);
-       prt_printf(out, "stripe_redundancy    %u\n",    a->stripe_redundancy);
-       prt_printf(out, "io_time[READ]        %llu\n",  a->io_time[READ]);
-       prt_printf(out, "io_time[WRITE]       %llu\n",  a->io_time[WRITE]);
-
-       if (ca)
-               prt_printf(out, "fragmentation     %llu\n",     alloc_lru_idx_fragmentation(*a, ca));
-       prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
-       printbuf_indent_sub(out, 2);
-
-       bch2_dev_put(ca);
-}
-
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bch_alloc_v4 _a;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-
-       __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
-}
-
-void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
-}
-
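-/*
- * Convert an alloc key of any version to v4 in *out, discarding any
- * backpointers stored in the value:
- */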
-void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
-{
-       if (k.k->type == KEY_TYPE_alloc_v4) {
-               void *src, *dst;
-
-               *out = *bkey_s_c_to_alloc_v4(k).v;
-
-               src = alloc_v4_backpointers(out);
-               SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
-               dst = alloc_v4_backpointers(out);
-
-               if (src < dst)
-                       memset(src, 0, dst - src);
-
-               SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
-       } else {
-               struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
-               *out = (struct bch_alloc_v4) {
-                       .journal_seq_nonempty   = u.journal_seq,
-                       .flags                  = u.need_discard,
-                       .gen                    = u.gen,
-                       .oldest_gen             = u.oldest_gen,
-                       .data_type              = u.data_type,
-                       .stripe_redundancy      = u.stripe_redundancy,
-                       .dirty_sectors          = u.dirty_sectors,
-                       .cached_sectors         = u.cached_sectors,
-                       .io_time[READ]          = u.read_time,
-                       .io_time[WRITE]         = u.write_time,
-                       .stripe                 = u.stripe,
-               };
-
-               SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
-       }
-}
-
-static noinline struct bkey_i_alloc_v4 *
-__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
-       struct bkey_i_alloc_v4 *ret;
-
-       ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
-       if (IS_ERR(ret))
-               return ret;
-
-       if (k.k->type == KEY_TYPE_alloc_v4) {
-               void *src, *dst;
-
-               bkey_reassemble(&ret->k_i, k);
-
-               src = alloc_v4_backpointers(&ret->v);
-               SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
-               dst = alloc_v4_backpointers(&ret->v);
-
-               if (src < dst)
-                       memset(src, 0, dst - src);
-
-               SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
-               set_alloc_v4_u64s(ret);
-       } else {
-               bkey_alloc_v4_init(&ret->k_i);
-               ret->k.p = k.k->p;
-               bch2_alloc_to_v4(k, &ret->v);
-       }
-       return ret;
-}
-
-static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
-{
-       struct bkey_s_c_alloc_v4 a;
-
-       if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
-           ((a = bkey_s_c_to_alloc_v4(k), true) &&
-            BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
-               return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
-
-       return __bch2_alloc_to_v4_mut(trans, k);
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
-       return bch2_alloc_to_v4_mut_inlined(trans, k);
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
-                                      struct bpos pos)
-{
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
-                                              BTREE_ITER_with_updates|
-                                              BTREE_ITER_cached|
-                                              BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (unlikely(ret))
-               return ERR_PTR(ret);
-
-       struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (unlikely(ret))
-               goto err;
-       return a;
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return ERR_PTR(ret);
-}
-
-__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
-                                                     enum btree_iter_update_trigger_flags flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
-                                              BTREE_ITER_with_updates|
-                                              BTREE_ITER_cached|
-                                              BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (unlikely(ret))
-               return ERR_PTR(ret);
-
-       if ((void *) k.v >= trans->mem &&
-           (void *) k.v <  trans->mem + trans->mem_top) {
-               bch2_trans_iter_exit(trans, &iter);
-               return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
-       }
-
-       struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
-       if (IS_ERR(a)) {
-               bch2_trans_iter_exit(trans, &iter);
-               return a;
-       }
-
-       ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
-       bch2_trans_iter_exit(trans, &iter);
-       return unlikely(ret) ? ERR_PTR(ret) : a;
-}
-
-static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
-{
-       *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
-
-       pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
-       return pos;
-}
-
-static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
-{
-       pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
-       pos.offset += offset;
-       return pos;
-}
-
-static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
-{
-       return k.k->type == KEY_TYPE_bucket_gens
-               ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
-               : 0;
-}
-
-int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
-                             struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
-                        c, bucket_gens_val_size_bad,
-                        "bad val size (%zu != %zu)",
-                        bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-fsck_err:
-       return ret;
-}
-
-void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
-               if (i)
-                       prt_char(out, ' ');
-               prt_printf(out, "%u", g.v->gens[i]);
-       }
-}
-
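-/*
- * Build the bucket_gens btree from the alloc btree: each bucket_gens key
- * packs the gens of a fixed-size range of buckets into a single value:
- */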
-int bch2_bucket_gens_init(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bkey_i_bucket_gens g;
-       bool have_bucket_gens_key = false;
-       int ret;
-
-       ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-                                BTREE_ITER_prefetch, k, ({
-               /*
-                * Not a fsck error because this is checked/repaired by
-                * bch2_check_alloc_key() which runs later:
-                */
-               if (!bch2_dev_bucket_exists(c, k.k->p))
-                       continue;
-
-               struct bch_alloc_v4 a;
-               u8 gen = bch2_alloc_to_v4(k, &a)->gen;
-               unsigned offset;
-               struct bpos pos = alloc_gens_pos(iter.pos, &offset);
-               int ret2 = 0;
-
-               if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
-                       ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
-                               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-                       if (ret2)
-                               goto iter_err;
-                       have_bucket_gens_key = false;
-               }
-
-               if (!have_bucket_gens_key) {
-                       bkey_bucket_gens_init(&g.k_i);
-                       g.k.p = pos;
-                       have_bucket_gens_key = true;
-               }
-
-               g.v.gens[offset] = gen;
-iter_err:
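-               /* the bare expression is this iteration's return code: */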
-               ret2;
-       }));
-
-       if (have_bucket_gens_key && !ret)
-               ret = commit_do(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc,
-                       bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-
-       bch2_trans_put(trans);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
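-/*
- * Populate the in-memory bucket gens from the bucket_gens btree when
- * present, falling back to reading every alloc key:
- */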
-int bch2_alloc_read(struct bch_fs *c)
-{
-       down_read(&c->state_lock);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bch_dev *ca = NULL;
-       int ret;
-
-       if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
-               ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
-                                        BTREE_ITER_prefetch, k, ({
-                       u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
-                       u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-
-                       if (k.k->type != KEY_TYPE_bucket_gens)
-                               continue;
-
-                       ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-                       /*
-                        * Not a fsck error because this is checked/repaired by
-                        * bch2_check_alloc_key() which runs later:
-                        */
-                       if (!ca) {
-                               bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
-                               continue;
-                       }
-
-                       const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
-                       for (u64 b = max_t(u64, ca->mi.first_bucket, start);
-                            b < min_t(u64, ca->mi.nbuckets, end);
-                            b++)
-                               *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
-                       0;
-               }));
-       } else {
-               ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-                                        BTREE_ITER_prefetch, k, ({
-                       ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-                       /*
-                        * Not a fsck error because this is checked/repaired by
-                        * bch2_check_alloc_key() which runs later:
-                        */
-                       if (!ca) {
-                               bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
-                               continue;
-                       }
-
-                       if (k.k->p.offset < ca->mi.first_bucket) {
-                               bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
-                               continue;
-                       }
-
-                       if (k.k->p.offset >= ca->mi.nbuckets) {
-                               bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
-                               continue;
-                       }
-
-                       struct bch_alloc_v4 a;
-                       *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
-                       0;
-               }));
-       }
-
-       bch2_dev_put(ca);
-       bch2_trans_put(trans);
-
-       up_read(&c->state_lock);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/* Free space/discard btree: */
-
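-/*
- * Report a bucket whose alloc key disagrees with the need_discard or
- * freespace btree; errors that were ignored or went unfixed are squashed:
- */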
-static int __need_discard_or_freespace_err(struct btree_trans *trans,
-                                          struct bkey_s_c alloc_k,
-                                          bool set, bool discard, bool repair)
-{
-       struct bch_fs *c = trans->c;
-       enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
-       enum bch_sb_error_id err_id = discard
-               ? BCH_FSCK_ERR_need_discard_key_wrong
-               : BCH_FSCK_ERR_freespace_key_wrong;
-       enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
-       struct printbuf buf = PRINTBUF;
-
-       bch2_bkey_val_to_text(&buf, c, alloc_k);
-
-       int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
-                                 "bucket incorrectly %sset in %s btree\n%s",
-                                 set ? "" : "un",
-                                 bch2_btree_id_str(btree),
-                                 buf.buf);
-       if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
-           bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
-               ret = 0;
-
-       printbuf_exit(&buf);
-       return ret;
-}
-
-#define need_discard_or_freespace_err(...)             \
-       fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
-
-#define need_discard_or_freespace_err_on(cond, ...)            \
-       (unlikely(cond) ?  need_discard_or_freespace_err(__VA_ARGS__) : false)
-
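-/*
- * Keep the need_discard and freespace btrees in sync with a bucket's alloc
- * state, flagging (but not auto-repairing) mismatches in the existing index:
- */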
-static int bch2_bucket_do_index(struct btree_trans *trans,
-                               struct bch_dev *ca,
-                               struct bkey_s_c alloc_k,
-                               const struct bch_alloc_v4 *a,
-                               bool set)
-{
-       enum btree_id btree;
-       struct bpos pos;
-
-       if (a->data_type != BCH_DATA_free &&
-           a->data_type != BCH_DATA_need_discard)
-               return 0;
-
-       switch (a->data_type) {
-       case BCH_DATA_free:
-               btree = BTREE_ID_freespace;
-               pos = alloc_freespace_pos(alloc_k.k->p, *a);
-               break;
-       case BCH_DATA_need_discard:
-               btree = BTREE_ID_need_discard;
-               pos = alloc_k.k->p;
-               break;
-       default:
-               return 0;
-       }
-
-       struct btree_iter iter;
-       struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
-       int ret = bkey_err(old);
-       if (ret)
-               return ret;
-
-       need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
-                                        !old.k->type != set,
-                                        trans, alloc_k, set,
-                                        btree == BTREE_ID_need_discard, false);
-
-       ret = bch2_btree_bit_mod_iter(trans, &iter, set);
-fsck_err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
-                                          struct bpos bucket, u8 gen)
-{
-       struct btree_iter iter;
-       unsigned offset;
-       struct bpos pos = alloc_gens_pos(bucket, &offset);
-       struct bkey_i_bucket_gens *g;
-       struct bkey_s_c k;
-       int ret;
-
-       g = bch2_trans_kmalloc(trans, sizeof(*g));
-       ret = PTR_ERR_OR_ZERO(g);
-       if (ret)
-               return ret;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
-                              BTREE_ITER_intent|
-                              BTREE_ITER_with_updates);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (k.k->type != KEY_TYPE_bucket_gens) {
-               bkey_bucket_gens_init(&g->k_i);
-               g->k.p = iter.pos;
-       } else {
-               bkey_reassemble(&g->k_i, k);
-       }
-
-       g->v.gens[offset] = gen;
-
-       ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
-                                                   enum bch_data_type data_type,
-                                                   s64 delta_buckets,
-                                                   s64 delta_sectors,
-                                                   s64 delta_fragmented, unsigned flags)
-{
-       s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
-
-       return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
-                                        d, dev_data_type,
-                                        .dev           = ca->dev_idx,
-                                        .data_type     = data_type);
-}
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
-                                  const struct bch_alloc_v4 *old,
-                                  const struct bch_alloc_v4 *new,
-                                  unsigned flags)
-{
-       s64 old_sectors = bch2_bucket_sectors(*old);
-       s64 new_sectors = bch2_bucket_sectors(*new);
-       if (old->data_type != new->data_type) {
-               int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
-                                1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
-                         bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
-                               -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
-               if (ret)
-                       return ret;
-       } else if (old_sectors != new_sectors) {
-               int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
-                                        0,
-                                        new_sectors - old_sectors,
-                                        bch2_bucket_sectors_fragmented(ca, *new) -
-                                        bch2_bucket_sectors_fragmented(ca, *old), flags);
-               if (ret)
-                       return ret;
-       }
-
-       s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
-       s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
-       if (old_unstriped != new_unstriped) {
-               int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
-                                        !!new_unstriped - !!old_unstriped,
-                                        new_unstriped - old_unstriped,
-                                        0,
-                                        flags);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
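-/*
- * Trigger for alloc key updates: transactionally maintains the freespace,
- * need_discard, bucket_gens and LRU btrees plus device accounting; at commit
- * time it tracks the journal seqs of empty <-> nonempty transitions and kicks
- * off discards, invalidates and gen garbage collection as buckets change
- * state:
- */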
-int bch2_trigger_alloc(struct btree_trans *trans,
-                      enum btree_id btree, unsigned level,
-                      struct bkey_s_c old, struct bkey_s new,
-                      enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
-       if (!ca)
-               return bch_err_throw(c, trigger_alloc);
-
-       struct bch_alloc_v4 old_a_convert;
-       const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
-       struct bch_alloc_v4 *new_a;
-       if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
-               new_a = bkey_s_to_alloc_v4(new).v;
-       } else {
-               BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
-
-               struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
-               ret = PTR_ERR_OR_ZERO(new_ka);
-               if (unlikely(ret))
-                       goto err;
-               new_a = &new_ka->v;
-       }
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               alloc_data_type_set(new_a, new_a->data_type);
-
-               int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
-                                    (int) data_type_is_empty(old_a->data_type);
-
-               if (is_empty_delta < 0) {
-                       new_a->io_time[READ] = bch2_current_io_time(c, READ);
-                       new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
-                       SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
-                       SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
-               }
-
-               if (data_type_is_empty(new_a->data_type) &&
-                   BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
-                   !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
-                       if (new_a->oldest_gen == new_a->gen &&
-                           !bch2_bucket_sectors_total(*new_a))
-                               new_a->oldest_gen++;
-                       new_a->gen++;
-                       SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
-                       alloc_data_type_set(new_a, new_a->data_type);
-               }
-
-               if (old_a->data_type != new_a->data_type ||
-                   (new_a->data_type == BCH_DATA_free &&
-                    alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
-                       ret =   bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
-                               bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
-                       if (ret)
-                               goto err;
-               }
-
-               if (new_a->data_type == BCH_DATA_cached &&
-                   !new_a->io_time[READ])
-                       new_a->io_time[READ] = bch2_current_io_time(c, READ);
-
-               ret = bch2_lru_change(trans, new.k->p.inode,
-                                     bucket_to_u64(new.k->p),
-                                     alloc_lru_idx_read(*old_a),
-                                     alloc_lru_idx_read(*new_a));
-               if (ret)
-                       goto err;
-
-               ret = bch2_lru_change(trans,
-                                     BCH_LRU_BUCKET_FRAGMENTATION,
-                                     bucket_to_u64(new.k->p),
-                                     alloc_lru_idx_fragmentation(*old_a, ca),
-                                     alloc_lru_idx_fragmentation(*new_a, ca));
-               if (ret)
-                       goto err;
-
-               if (old_a->gen != new_a->gen) {
-                       ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
-                       if (ret)
-                               goto err;
-               }
-
-               ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
-               if (ret)
-                       goto err;
-       }
-
-       if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-               u64 transaction_seq = trans->journal_res.seq;
-               BUG_ON(!transaction_seq);
-
-               if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
-                                   trans, alloc_key_journal_seq_in_future,
-                                   "bucket journal seq in future (currently at %llu)\n%s",
-                                   journal_cur_seq(&c->journal),
-                                   (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
-                       new_a->journal_seq_nonempty = transaction_seq;
-
-               int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
-                                    (int) data_type_is_empty(old_a->data_type);
-
-               /*
-                * Record journal sequence number of empty -> nonempty transition:
-                * Note that there may be multiple empty -> nonempty
-                * transitions, data in a bucket may be overwritten while we're
-                * still writing to it - so be careful to only record the first:
-                */
-               if (is_empty_delta < 0 &&
-                   new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
-                       new_a->journal_seq_nonempty     = transaction_seq;
-                       new_a->journal_seq_empty        = 0;
-               }
-
-               /*
-                * Bucket becomes empty: mark it as waiting for a journal flush,
-                * unless updates since empty -> nonempty transition were never
-                * flushed - we may need to ask the journal not to flush
-                * intermediate sequence numbers:
-                */
-               if (is_empty_delta > 0) {
-                       if (new_a->journal_seq_nonempty == transaction_seq ||
-                           bch2_journal_noflush_seq(&c->journal,
-                                                    new_a->journal_seq_nonempty,
-                                                    transaction_seq)) {
-                               new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
-                       } else {
-                               new_a->journal_seq_empty = transaction_seq;
-
-                               ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-                                                                          c->journal.flushed_seq_ondisk,
-                                                                          new.k->p.inode, new.k->p.offset,
-                                                                          transaction_seq);
-                               if (bch2_fs_fatal_err_on(ret, c,
-                                               "setting bucket_needs_journal_commit: %s",
-                                               bch2_err_str(ret)))
-                                       goto err;
-                       }
-               }
-
-               if (new_a->gen != old_a->gen) {
-                       guard(rcu)();
-                       u8 *gen = bucket_gen(ca, new.k->p.offset);
-                       if (unlikely(!gen))
-                               goto invalid_bucket;
-                       *gen = new_a->gen;
-               }
-
-#define eval_state(_a, expr)           ({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr)              !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a)              (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
-
-               if (statechange(a->data_type == BCH_DATA_free) &&
-                   bucket_flushed(new_a))
-                       closure_wake_up(&c->freelist_wait);
-
-               if (statechange(a->data_type == BCH_DATA_need_discard) &&
-                   !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
-                   bucket_flushed(new_a))
-                       bch2_discard_one_bucket_fast(ca, new.k->p.offset);
-
-               if (statechange(a->data_type == BCH_DATA_cached) &&
-                   !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
-                   should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
-                       bch2_dev_do_invalidates(ca);
-
-               if (statechange(a->data_type == BCH_DATA_need_gc_gens))
-                       bch2_gc_gens_async(c);
-       }
-
-       if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
-               guard(rcu)();
-               struct bucket *g = gc_bucket(ca, new.k->p.offset);
-               if (unlikely(!g))
-                       goto invalid_bucket;
-               g->gen_valid    = 1;
-               g->gen          = new_a->gen;
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       bch2_dev_put(ca);
-       return ret;
-invalid_bucket:
-       bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
-                            (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
-       ret = bch_err_throw(c, trigger_alloc);
-       goto err;
-}
-
-/*
- * This synthesizes deleted extents for holes, like BTREE_ITER_slots does for
- * extents-style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
-                                           struct bpos end, struct bkey *hole)
-{
-       struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-
-       if (bkey_err(k))
-               return k;
-
-       if (k.k->type) {
-               return k;
-       } else {
-               struct btree_iter iter2;
-               struct bpos next;
-
-               bch2_trans_copy_iter(trans, &iter2, iter);
-
-               struct btree_path *path = btree_iter_path(trans, iter);
-               if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
-                       end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
-               end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
-               /*
-                * btree node min/max is a closed interval, upto takes a
-                * half-open interval:
-                */
-               k = bch2_btree_iter_peek_max(trans, &iter2, end);
-               next = iter2.pos;
-               bch2_trans_iter_exit(trans, &iter2);
-
-               BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
-               if (bkey_err(k))
-                       return k;
-
-               bkey_init(hole);
-               hole->p = iter->pos;
-
-               bch2_key_resize(hole, next.offset - iter->pos.offset);
-               return (struct bkey_s_c) { hole, NULL };
-       }
-}
-
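-/*
- * Step to the first valid bucket at or after *bucket, advancing to the next
- * member device once the current one is exhausted:
- */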
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
-       if (*ca) {
-               if (bucket->offset < (*ca)->mi.first_bucket)
-                       bucket->offset = (*ca)->mi.first_bucket;
-
-               if (bucket->offset < (*ca)->mi.nbuckets)
-                       return true;
-
-               bch2_dev_put(*ca);
-               *ca = NULL;
-               bucket->inode++;
-               bucket->offset = 0;
-       }
-
-       guard(rcu)();
-       *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
-       if (*ca) {
-               *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
-               bch2_dev_get(*ca);
-       }
-
-       return *ca != NULL;
-}
-
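-/*
- * Like bch2_get_key_or_hole(), but clamps synthesized holes to buckets that
- * actually exist on a member device:
- */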
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
-                                                       struct btree_iter *iter,
-                                                       struct bch_dev **ca, struct bkey *hole)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
-again:
-       k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
-       if (bkey_err(k))
-               return k;
-
-       *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
-       if (!k.k->type) {
-               struct bpos hole_start = bkey_start_pos(k.k);
-
-               if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
-                       if (!next_bucket(c, ca, &hole_start))
-                               return bkey_s_c_null;
-
-                       bch2_btree_iter_set_pos(trans, iter, hole_start);
-                       goto again;
-               }
-
-               if (k.k->p.offset > (*ca)->mi.nbuckets)
-                       bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
-       }
-
-       return k;
-}
-
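-/*
- * Check a single alloc key against the need_discard, freespace and
- * bucket_gens btrees, repairing any inconsistencies we find:
- */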
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
-                        struct bkey_s_c alloc_k,
-                        struct btree_iter *alloc_iter,
-                        struct btree_iter *discard_iter,
-                        struct btree_iter *freespace_iter,
-                        struct btree_iter *bucket_gens_iter)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a;
-       unsigned gens_offset;
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
-       if (fsck_err_on(!ca,
-                       trans, alloc_key_to_missing_dev_bucket,
-                       "alloc key for invalid device:bucket %llu:%llu",
-                       alloc_k.k->p.inode, alloc_k.k->p.offset))
-               ret = bch2_btree_delete_at(trans, alloc_iter, 0);
-       if (!ca)
-               return ret;
-
-       if (!ca->mi.freespace_initialized)
-               goto out;
-
-       a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-       bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
-       k = bch2_btree_iter_peek_slot(trans, discard_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       bool is_discarded = a->data_type == BCH_DATA_need_discard;
-       if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
-                                            trans, alloc_k, !is_discarded, true, true)) {
-               ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
-               if (ret)
-                       goto err;
-       }
-
-       bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
-       k = bch2_btree_iter_peek_slot(trans, freespace_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       bool is_free = a->data_type == BCH_DATA_free;
-       if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
-                                            trans, alloc_k, !is_free, false, true)) {
-               ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
-               if (ret)
-                       goto err;
-       }
-
-       bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
-       k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
-                       trans, bucket_gens_key_wrong,
-                       "incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
-                       alloc_gen(k, gens_offset), a->gen,
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-               struct bkey_i_bucket_gens *g =
-                       bch2_trans_kmalloc(trans, sizeof(*g));
-
-               ret = PTR_ERR_OR_ZERO(g);
-               if (ret)
-                       goto err;
-
-               if (k.k->type == KEY_TYPE_bucket_gens) {
-                       bkey_reassemble(&g->k_i, k);
-               } else {
-                       bkey_bucket_gens_init(&g->k_i);
-                       g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
-               }
-
-               g->v.gens[gens_offset] = a->gen;
-
-               ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
-               if (ret)
-                       goto err;
-       }
-out:
-err:
-fsck_err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
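-/*
- * A hole in the alloc btree means those buckets are free: check that the
- * freespace btree agrees, adding the missing entry if not:
- */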
-static noinline_for_stack
-int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
-                                   struct bch_dev *ca,
-                                   struct bpos start,
-                                   struct bpos *end,
-                                   struct btree_iter *freespace_iter)
-{
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       if (!ca->mi.freespace_initialized)
-               return 0;
-
-       bch2_btree_iter_set_pos(trans, freespace_iter, start);
-
-       k = bch2_btree_iter_peek_slot(trans, freespace_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       *end = bkey_min(k.k->p, *end);
-
-       if (fsck_err_on(k.k->type != KEY_TYPE_set,
-                       trans, freespace_hole_missing,
-                       "hole in alloc btree missing in freespace btree\n"
-                       "device %llu buckets %llu-%llu",
-                       freespace_iter->pos.inode,
-                       freespace_iter->pos.offset,
-                       end->offset)) {
-               struct bkey_i *update =
-                       bch2_trans_kmalloc(trans, sizeof(*update));
-
-               ret = PTR_ERR_OR_ZERO(update);
-               if (ret)
-                       goto err;
-
-               bkey_init(&update->k);
-               update->k.type  = KEY_TYPE_set;
-               update->k.p     = freespace_iter->pos;
-               bch2_key_resize(&update->k,
-                               min_t(u64, U32_MAX, end->offset -
-                                     freespace_iter->pos.offset));
-
-               ret = bch2_trans_update(trans, freespace_iter, update, 0);
-               if (ret)
-                       goto err;
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
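-/*
- * Buckets in a hole in the alloc btree should have zero gens in the
- * bucket_gens btree; clear any nonzero gens we find:
- */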
-static noinline_for_stack
-int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
-                                     struct bpos start,
-                                     struct bpos *end,
-                                     struct btree_iter *bucket_gens_iter)
-{
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-       unsigned i, gens_offset, gens_end_offset;
-       int ret;
-
-       bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
-
-       k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
-                    alloc_gens_pos(*end,  &gens_end_offset)))
-               gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
-
-       if (k.k->type == KEY_TYPE_bucket_gens) {
-               struct bkey_i_bucket_gens g;
-               bool need_update = false;
-
-               bkey_reassemble(&g.k_i, k);
-
-               for (i = gens_offset; i < gens_end_offset; i++) {
-                       if (fsck_err_on(g.v.gens[i], trans,
-                                       bucket_gens_hole_wrong,
-                                       "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
-                                       bucket_gens_pos_to_alloc(k.k->p, i).inode,
-                                       bucket_gens_pos_to_alloc(k.k->p, i).offset,
-                                       g.v.gens[i])) {
-                               g.v.gens[i] = 0;
-                               need_update = true;
-                       }
-               }
-
-               if (need_update) {
-                       struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
-                       ret = PTR_ERR_OR_ZERO(u);
-                       if (ret)
-                               goto err;
-
-                       memcpy(u, &g, sizeof(g));
-
-                       ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
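-/*
- * For repairing a bad need_discard/freespace key from the allocator path,
- * where we can't commit directly: the repair is punted to a workqueue:
- */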
-struct check_discard_freespace_key_async {
-       struct work_struct      work;
-       struct bch_fs           *c;
-       struct bbpos            pos;
-};
-
-static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       u8 gen;
-       ret = k.k->type != KEY_TYPE_set
-               ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
-               : 0;
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static void check_discard_freespace_key_work(struct work_struct *work)
-{
-       struct check_discard_freespace_key_async *w =
-               container_of(work, struct check_discard_freespace_key_async, work);
-
-       bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
-       enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
-       kfree(w);
-}
-
-int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
-                                    bool async_repair)
-{
-       struct bch_fs *c = trans->c;
-       enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
-               ? BCH_DATA_need_discard
-               : BCH_DATA_free;
-       struct printbuf buf = PRINTBUF;
-
-       unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
-               FSCK_CAN_FIX|FSCK_CAN_IGNORE;
-
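-       /*
-        * In the freespace btree the high 8 bits of the key's offset encode
-        * generation bits; mask them off to get the bucket:
-        */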
-       struct bpos bucket = iter->pos;
-       bucket.offset &= ~(~0ULL << 56);
-       u64 genbits = iter->pos.offset & (~0ULL << 56);
-
-       struct btree_iter alloc_iter;
-       struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
-                                                    BTREE_ID_alloc, bucket,
-                                                    async_repair ? BTREE_ITER_cached : 0);
-       int ret = bkey_err(alloc_k);
-       if (ret)
-               return ret;
-
-       if (!bch2_dev_bucket_exists(c, bucket)) {
-               if (__fsck_err(trans, fsck_flags,
-                              need_discard_freespace_key_to_invalid_dev_bucket,
-                              "entry in %s btree for nonexistent dev:bucket %llu:%llu",
-                              bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
-                       goto delete;
-               ret = 1;
-               goto out;
-       }
-
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-       if (a->data_type != state ||
-           (state == BCH_DATA_free &&
-            genbits != alloc_freespace_genbits(*a))) {
-               if (__fsck_err(trans, fsck_flags,
-                              need_discard_freespace_key_bad,
-                            "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
-                            (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
-                            bch2_btree_id_str(iter->btree_id),
-                            iter->pos.inode,
-                            iter->pos.offset,
-                            a->data_type == state,
-                            genbits >> 56, alloc_freespace_genbits(*a) >> 56))
-                       goto delete;
-               ret = 1;
-               goto out;
-       }
-
-       *gen = a->gen;
-out:
-fsck_err:
-       bch2_set_btree_iter_dontneed(trans, &alloc_iter);
-       bch2_trans_iter_exit(trans, &alloc_iter);
-       printbuf_exit(&buf);
-       return ret;
-delete:
-       if (!async_repair) {
-               ret =   bch2_btree_bit_mod_iter(trans, iter, false) ?:
-                       bch2_trans_commit(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc) ?:
-                       bch_err_throw(c, transaction_restart_commit);
-               goto out;
-       } else {
-               /*
-                * We can't repair here when called from the allocator path: the
-                * commit will recurse back into the allocator
-                */
-               struct check_discard_freespace_key_async *w =
-                       kzalloc(sizeof(*w), GFP_KERNEL);
-               if (!w)
-                       goto out;
-
-               if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
-                       kfree(w);
-                       goto out;
-               }
-
-               INIT_WORK(&w->work, check_discard_freespace_key_work);
-               w->c = c;
-               w->pos = BBPOS(iter->btree_id, iter->pos);
-               queue_work(c->write_ref_wq, &w->work);
-
-               ret = 1; /* don't allocate from this bucket */
-               goto out;
-       }
-}
-
-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
-{
-       u8 gen;
-       int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
-       return ret < 0 ? ret : 0;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */
-static noinline_for_stack
-int bch2_check_bucket_gens_key(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i_bucket_gens g;
-       u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
-       u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-       u64 b;
-       bool need_update = false;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
-       bkey_reassemble(&g.k_i, k);
-
-       struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
-       if (!ca) {
-               if (fsck_err(trans, bucket_gens_to_invalid_dev,
-                            "bucket_gens key for invalid device:\n%s",
-                            (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       ret = bch2_btree_delete_at(trans, iter, 0);
-               goto out;
-       }
-
-       if (fsck_err_on(end <= ca->mi.first_bucket ||
-                       start >= ca->mi.nbuckets,
-                       trans, bucket_gens_to_invalid_buckets,
-                       "bucket_gens key for invalid buckets:\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               ret = bch2_btree_delete_at(trans, iter, 0);
-               goto out;
-       }
-
-       for (b = start; b < ca->mi.first_bucket; b++)
-               if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
-                               trans, bucket_gens_nonzero_for_invalid_buckets,
-                               "bucket_gens key has nonzero gen for invalid bucket")) {
-                       g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
-                       need_update = true;
-               }
-
-       for (b = ca->mi.nbuckets; b < end; b++)
-               if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
-                               trans, bucket_gens_nonzero_for_invalid_buckets,
-                               "bucket_gens key has nonzero gen for invalid bucket")) {
-                       g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
-                       need_update = true;
-               }
-
-       if (need_update) {
-               struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto out;
-
-               memcpy(u, &g, sizeof(g));
-               ret = bch2_trans_update(trans, iter, u, 0);
-       }
-out:
-fsck_err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
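-/*
- * Walk the entire alloc btree, checking each key (and each hole) against the
- * need_discard, freespace and bucket_gens btrees - then scan those btrees for
- * entries that shouldn't exist:
- */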
-int bch2_check_alloc_info(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
-       struct bch_dev *ca = NULL;
-       struct bkey hole;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
-                            BTREE_ITER_prefetch);
-       bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
-                            BTREE_ITER_prefetch);
-       bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
-                            BTREE_ITER_prefetch);
-       bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
-                            BTREE_ITER_prefetch);
-
-       while (1) {
-               struct bpos next;
-
-               bch2_trans_begin(trans);
-
-               k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
-               ret = bkey_err(k);
-               if (ret)
-                       goto bkey_err;
-
-               if (!k.k)
-                       break;
-
-               if (k.k->type) {
-                       next = bpos_nosnap_successor(k.k->p);
-
-                       ret = bch2_check_alloc_key(trans,
-                                                  k, &iter,
-                                                  &discard_iter,
-                                                  &freespace_iter,
-                                                  &bucket_gens_iter);
-                       if (ret)
-                               goto bkey_err;
-               } else {
-                       next = k.k->p;
-
-                       ret = bch2_check_alloc_hole_freespace(trans, ca,
-                                                   bkey_start_pos(k.k),
-                                                   &next,
-                                                   &freespace_iter) ?:
-                               bch2_check_alloc_hole_bucket_gens(trans,
-                                                   bkey_start_pos(k.k),
-                                                   &next,
-                                                   &bucket_gens_iter);
-                       if (ret)
-                               goto bkey_err;
-               }
-
-               ret = bch2_trans_commit(trans, NULL, NULL,
-                                       BCH_TRANS_COMMIT_no_enospc);
-               if (ret)
-                       goto bkey_err;
-
-               bch2_btree_iter_set_pos(trans, &iter, next);
-bkey_err:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &bucket_gens_iter);
-       bch2_trans_iter_exit(trans, &freespace_iter);
-       bch2_trans_iter_exit(trans, &discard_iter);
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_dev_put(ca);
-       ca = NULL;
-
-       if (ret < 0)
-               goto err;
-
-       ret = for_each_btree_key(trans, iter,
-                       BTREE_ID_need_discard, POS_MIN,
-                       BTREE_ITER_prefetch, k,
-               bch2_check_discard_freespace_key_fsck(trans, &iter));
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
-                            BTREE_ITER_prefetch);
-       while (1) {
-               bch2_trans_begin(trans);
-               k = bch2_btree_iter_peek(trans, &iter);
-               if (!k.k)
-                       break;
-
-               ret = bkey_err(k) ?:
-                       bch2_check_discard_freespace_key_fsck(trans, &iter);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                       ret = 0;
-                       continue;
-               }
-               if (ret) {
-                       struct printbuf buf = PRINTBUF;
-                       bch2_bkey_val_to_text(&buf, c, k);
-
-                       bch_err(c, "while checking %s", buf.buf);
-                       printbuf_exit(&buf);
-                       break;
-               }
-
-               bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
-       }
-       bch2_trans_iter_exit(trans, &iter);
-       if (ret)
-               goto err;
-
-       ret = for_each_btree_key_commit(trans, iter,
-                       BTREE_ID_bucket_gens, POS_MIN,
-                       BTREE_ITER_prefetch, k,
-                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-               bch2_check_bucket_gens_key(trans, &iter, k));
-err:
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
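-/*
- * Check that this bucket is on the LRUs it should be: the fragmentation LRU
- * if it's fragmented, and the cached-data LRU (keyed by read time) if it
- * holds cached data:
- */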
-static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
-                                      struct btree_iter *alloc_iter,
-                                      struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a;
-       struct bkey_s_c alloc_k;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
-       if (!alloc_k.k)
-               return 0;
-
-       ret = bkey_err(alloc_k);
-       if (ret)
-               return ret;
-
-       struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
-       if (!ca)
-               return 0;
-
-       a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-       u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
-       if (lru_idx) {
-               ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
-                                        bucket_to_u64(alloc_k.k->p),
-                                        lru_idx, alloc_k, last_flushed);
-               if (ret)
-                       goto err;
-       }
-
-       if (a->data_type != BCH_DATA_cached)
-               goto err;
-
-       if (fsck_err_on(!a->io_time[READ],
-                       trans, alloc_key_cached_but_read_time_zero,
-                       "cached bucket with read_time 0\n%s",
-               (printbuf_reset(&buf),
-                bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
-               struct bkey_i_alloc_v4 *a_mut =
-                       bch2_alloc_to_v4_mut(trans, alloc_k);
-               ret = PTR_ERR_OR_ZERO(a_mut);
-               if (ret)
-                       goto err;
-
-               a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
-               ret = bch2_trans_update(trans, alloc_iter,
-                                       &a_mut->k_i, BTREE_TRIGGER_norun);
-               if (ret)
-                       goto err;
-
-               a = &a_mut->v;
-       }
-
-       ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
-                                bucket_to_u64(alloc_k.k->p),
-                                a->io_time[READ],
-                                alloc_k, last_flushed);
-       if (ret)
-               goto err;
-err:
-fsck_err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
-{
-       struct bkey_buf last_flushed;
-
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
-                               POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
-               bch2_check_stripe_to_lru_refs(c);
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
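-/*
- * Track buckets with a discard in flight so that the fastpath and the main
- * discard path never issue a discard for the same bucket twice:
- */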
-static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
-{
-       struct bch_fs *c = ca->fs;
-       int ret;
-
-       mutex_lock(&ca->discard_buckets_in_flight_lock);
-       struct discard_in_flight *i =
-               darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
-       if (i) {
-               ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
-               goto out;
-       }
-
-       ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
-                          .in_progress = in_progress,
-                          .bucket      = bucket,
-       }));
-out:
-       mutex_unlock(&ca->discard_buckets_in_flight_lock);
-       return ret;
-}
-
-static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
-{
-       mutex_lock(&ca->discard_buckets_in_flight_lock);
-       struct discard_in_flight *i =
-               darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
-       BUG_ON(!i || !i->in_progress);
-
-       darray_remove_item(&ca->discard_buckets_in_flight, i);
-       mutex_unlock(&ca->discard_buckets_in_flight_lock);
-}
-
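-/* Counters for a single pass of the discard paths, reported via tracepoints: */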
-struct discard_buckets_state {
-       u64             seen;
-       u64             open;
-       u64             need_journal_commit;
-       u64             discarded;
-};
-
-static int bch2_discard_one_bucket(struct btree_trans *trans,
-                                  struct bch_dev *ca,
-                                  struct btree_iter *need_discard_iter,
-                                  struct bpos *discard_pos_done,
-                                  struct discard_buckets_state *s,
-                                  bool fastpath)
-{
-       struct bch_fs *c = trans->c;
-       struct bpos pos = need_discard_iter->pos;
-       struct btree_iter iter = {};
-       struct bkey_s_c k;
-       struct bkey_i_alloc_v4 *a;
-       struct printbuf buf = PRINTBUF;
-       bool discard_locked = false;
-       int ret = 0;
-
-       if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
-               s->open++;
-               goto out;
-       }
-
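-       /*
-        * A bucket freed by a journal entry that hasn't yet hit disk can't be
-        * discarded yet:
-        */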
-       u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
-                                                     pos.inode, pos.offset);
-       if (seq_ready > c->journal.flushed_seq_ondisk) {
-               if (seq_ready > c->journal.flushing_seq)
-                       s->need_journal_commit++;
-               goto out;
-       }
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
-                              need_discard_iter->pos,
-                              BTREE_ITER_cached);
-       ret = bkey_err(k);
-       if (ret)
-               goto out;
-
-       a = bch2_alloc_to_v4_mut(trans, k);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               goto out;
-
-       if (a->v.data_type != BCH_DATA_need_discard) {
-               if (need_discard_or_freespace_err(trans, k, true, true, true)) {
-                       ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
-                       if (ret)
-                               goto out;
-                       goto commit;
-               }
-
-               goto out;
-       }
-
-       if (!fastpath) {
-               if (discard_in_flight_add(ca, iter.pos.offset, true))
-                       goto out;
-
-               discard_locked = true;
-       }
-
-       if (!bkey_eq(*discard_pos_done, iter.pos)) {
-               s->discarded++;
-               *discard_pos_done = iter.pos;
-
-               if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
-                       /*
-                        * This works without any other locks because this is the only
-                        * thread that removes items from the need_discard tree
-                        */
-                       bch2_trans_unlock_long(trans);
-                       blkdev_issue_discard(ca->disk_sb.bdev,
-                                            k.k->p.offset * ca->mi.bucket_size,
-                                            ca->mi.bucket_size,
-                                            GFP_KERNEL);
-                       ret = bch2_trans_relock_notrace(trans);
-                       if (ret)
-                               goto out;
-               }
-       }
-
-       SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
-       alloc_data_type_set(&a->v, a->v.data_type);
-
-       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-       if (ret)
-               goto out;
-commit:
-       ret = bch2_trans_commit(trans, NULL, NULL,
-                               BCH_WATERMARK_btree|
-                               BCH_TRANS_COMMIT_no_enospc);
-       if (ret)
-               goto out;
-
-       if (!fastpath)
-               count_event(c, bucket_discard);
-       else
-               count_event(c, bucket_discard_fast);
-out:
-fsck_err:
-       if (discard_locked)
-               discard_in_flight_remove(ca, iter.pos.offset);
-       if (!ret)
-               s->seen++;
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static void bch2_do_discards_work(struct work_struct *work)
-{
-       struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
-       struct bch_fs *c = ca->fs;
-       struct discard_buckets_state s = {};
-       struct bpos discard_pos_done = POS_MAX;
-       int ret;
-
-       /*
-        * We're doing the commit in bch2_discard_one_bucket instead of using
-        * for_each_btree_key_commit() so that we can increment counters after
-        * successful commit:
-        */
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter,
-                                  BTREE_ID_need_discard,
-                                  POS(ca->dev_idx, 0),
-                                  POS(ca->dev_idx, U64_MAX), 0, k,
-                       bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
-
-       if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
-               bch2_journal_flush_async(&c->journal, NULL);
-
-       trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
-                             bch2_err_str(ret));
-
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_dev_do_discards(struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
-               return;
-
-       if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
-               goto put_write_ref;
-
-       if (queue_work(c->write_ref_wq, &ca->discard_work))
-               return;
-
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
-put_write_ref:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_do_discards(struct bch_fs *c)
-{
-       for_each_member_device(c, ca)
-               bch2_dev_do_discards(ca);
-}
-
-static int bch2_do_discards_fast_one(struct btree_trans *trans,
-                                    struct bch_dev *ca,
-                                    u64 bucket,
-                                    struct bpos *discard_pos_done,
-                                    struct discard_buckets_state *s)
-{
-       struct btree_iter need_discard_iter;
-       struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
-                                       BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
-       int ret = bkey_err(discard_k);
-       if (ret)
-               return ret;
-
-       if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
-                           trans, discarding_bucket_not_in_need_discard_btree,
-                           "attempting to discard bucket %u:%llu not in need_discard btree",
-                           ca->dev_idx, bucket))
-               goto out;
-
-       ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, &need_discard_iter);
-       return ret;
-}
-
-static void bch2_do_discards_fast_work(struct work_struct *work)
-{
-       struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
-       struct bch_fs *c = ca->fs;
-       struct discard_buckets_state s = {};
-       struct bpos discard_pos_done = POS_MAX;
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = 0;
-
-       while (1) {
-               bool got_bucket = false;
-               u64 bucket;
-
-               mutex_lock(&ca->discard_buckets_in_flight_lock);
-               darray_for_each(ca->discard_buckets_in_flight, i) {
-                       if (i->in_progress)
-                               continue;
-
-                       got_bucket = true;
-                       bucket = i->bucket;
-                       i->in_progress = true;
-                       break;
-               }
-               mutex_unlock(&ca->discard_buckets_in_flight_lock);
-
-               if (!got_bucket)
-                       break;
-
-               ret = lockrestart_do(trans,
-                       bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
-               bch_err_fn(c, ret);
-
-               discard_in_flight_remove(ca, bucket);
-
-               if (ret)
-                       break;
-       }
-
-       trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
-
-       bch2_trans_put(trans);
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
-{
-       struct bch_fs *c = ca->fs;
-
-       if (discard_in_flight_add(ca, bucket, false))
-               return;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
-               return;
-
-       if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
-               goto put_ref;
-
-       if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
-               return;
-
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
-put_ref:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
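-/*
- * Evict the cached data in a bucket: look up the extent a backpointer points
- * to and drop this device's pointer from it:
- */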
-static int invalidate_one_bp(struct btree_trans *trans,
-                            struct bch_dev *ca,
-                            struct bkey_s_c_backpointer bp,
-                            struct bkey_buf *last_flushed)
-{
-       struct btree_iter extent_iter;
-       struct bkey_s_c extent_k =
-               bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
-       int ret = bkey_err(extent_k);
-       if (ret)
-               return ret;
-
-       if (!extent_k.k)
-               return 0;
-
-       struct bkey_i *n =
-               bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
-                                  BTREE_UPDATE_internal_snapshot_node);
-       ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               goto err;
-
-       bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
-err:
-       bch2_trans_iter_exit(trans, &extent_iter);
-       return ret;
-}
-
-static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
-                                       struct bch_dev *ca,
-                                       struct bpos bucket,
-                                       u8 gen,
-                                       struct bkey_buf *last_flushed)
-{
-       struct bpos bp_start    = bucket_pos_to_bp_start(ca,    bucket);
-       struct bpos bp_end      = bucket_pos_to_bp_end(ca,      bucket);
-
-       return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
-                                     bp_start, bp_end, 0, k,
-                                     NULL, NULL,
-                                     BCH_WATERMARK_btree|
-                                     BCH_TRANS_COMMIT_no_enospc, ({
-               if (k.k->type != KEY_TYPE_backpointer)
-                       continue;
-
-               struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
-               /* filter out bps with gens that don't match: */
-               if (bp.v->bucket_gen != gen)
-                       continue;
-
-               invalidate_one_bp(trans, ca, bp, last_flushed);
-       }));
-}
-
-noinline_for_stack
-static int invalidate_one_bucket(struct btree_trans *trans,
-                                struct bch_dev *ca,
-                                struct btree_iter *lru_iter,
-                                struct bkey_s_c lru_k,
-                                struct bkey_buf *last_flushed,
-                                s64 *nr_to_invalidate)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
-       struct btree_iter alloc_iter = {};
-       int ret = 0;
-
-       if (*nr_to_invalidate <= 0)
-               return 1;
-
-       if (!bch2_dev_bucket_exists(c, bucket)) {
-               if (fsck_err(trans, lru_entry_to_invalid_bucket,
-                            "lru key points to nonexistent device:bucket %llu:%llu",
-                            bucket.inode, bucket.offset))
-                       return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
-               goto out;
-       }
-
-       if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
-               return 0;
-
-       struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
-                                                    BTREE_ID_alloc, bucket,
-                                                    BTREE_ITER_cached);
-       ret = bkey_err(alloc_k);
-       if (ret)
-               return ret;
-
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
-       /* We expect harmless races here due to the btree write buffer: */
-       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
-               goto out;
-
-       /*
-        * Impossible since alloc_lru_idx_read() only returns nonzero if the
-        * bucket is supposed to be on the cached bucket LRU (i.e.
-        * BCH_DATA_cached)
-        *
-        * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
-        */
-       BUG_ON(a->data_type != BCH_DATA_cached);
-       BUG_ON(a->dirty_sectors);
-
-       if (!a->cached_sectors) {
-               bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
-                                                      true, last_flushed);
-               goto out;
-       }
-
-       unsigned cached_sectors = a->cached_sectors;
-       u8 gen = a->gen;
-
-       ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
-       if (ret)
-               goto out;
-
-       trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
-       --*nr_to_invalidate;
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, &alloc_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
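-/*
- * Walk this device's LRU keys, wrapping around to the start once when we hit
- * the end:
- */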
-static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
-                                   struct bch_dev *ca, bool *wrapped)
-{
-       struct bkey_s_c k;
-again:
-       k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
-       if (!k.k && !*wrapped) {
-               bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
-               *wrapped = true;
-               goto again;
-       }
-
-       return k;
-}
-
-static void bch2_do_invalidates_work(struct work_struct *work)
-{
-       struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
-       struct bch_fs *c = ca->fs;
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = 0;
-
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       ret = bch2_btree_write_buffer_tryflush(trans);
-       if (ret)
-               goto err;
-
-       s64 nr_to_invalidate =
-               should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
-       struct btree_iter iter;
-       bool wrapped = false;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
-                            lru_pos(ca->dev_idx, 0,
-                                    ((bch2_current_io_time(c, READ) + U32_MAX) &
-                                     LRU_TIME_MAX)), 0);
-
-       while (true) {
-               bch2_trans_begin(trans);
-
-               struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
-               ret = bkey_err(k);
-               if (ret)
-                       goto restart_err;
-               if (!k.k)
-                       break;
-
-               ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
-restart_err:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               bch2_btree_iter_advance(trans, &iter);
-       }
-       bch2_trans_iter_exit(trans, &iter);
-err:
-       bch2_trans_put(trans);
-       bch2_bkey_buf_exit(&last_flushed, c);
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
-               return;
-
-       if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
-               goto put_ref;
-
-       if (queue_work(c->write_ref_wq, &ca->invalidate_work))
-               return;
-
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
-put_ref:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_do_invalidates(struct bch_fs *c)
-{
-       for_each_member_device(c, ca)
-               bch2_dev_do_invalidates(ca);
-}
-
-int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
-                           u64 bucket_start, u64 bucket_end)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey hole;
-       struct bpos end = POS(ca->dev_idx, bucket_end);
-       struct bch_member *m;
-       unsigned long last_updated = jiffies;
-       int ret;
-
-       BUG_ON(bucket_start > bucket_end);
-       BUG_ON(bucket_end > ca->mi.nbuckets);
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-               POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
-               BTREE_ITER_prefetch);
-       /*
-        * Scan the alloc btree for every bucket on @ca, and add buckets to the
-        * freespace/need_discard/need_gc_gens btrees as needed:
-        */
-       while (1) {
-               if (time_after(jiffies, last_updated + HZ * 10)) {
-                       bch_info(ca, "%s: currently at %llu/%llu",
-                                __func__, iter.pos.offset, ca->mi.nbuckets);
-                       last_updated = jiffies;
-               }
-
-               bch2_trans_begin(trans);
-
-               if (bkey_ge(iter.pos, end)) {
-                       ret = 0;
-                       break;
-               }
-
-               k = bch2_get_key_or_hole(trans, &iter, end, &hole);
-               ret = bkey_err(k);
-               if (ret)
-                       goto bkey_err;
-
-               if (k.k->type) {
-                       /*
-                        * We process live keys in the alloc btree one at a
-                        * time:
-                        */
-                       struct bch_alloc_v4 a_convert;
-                       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
-                       ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
-                               bch2_trans_commit(trans, NULL, NULL,
-                                                 BCH_TRANS_COMMIT_no_enospc);
-                       if (ret)
-                               goto bkey_err;
-
-                       bch2_btree_iter_advance(trans, &iter);
-               } else {
-                       struct bkey_i *freespace;
-
-                       freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
-                       ret = PTR_ERR_OR_ZERO(freespace);
-                       if (ret)
-                               goto bkey_err;
-
-                       bkey_init(&freespace->k);
-                       freespace->k.type       = KEY_TYPE_set;
-                       freespace->k.p          = k.k->p;
-                       freespace->k.size       = k.k->size;
-
-                       ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
-                               bch2_trans_commit(trans, NULL, NULL,
-                                                 BCH_TRANS_COMMIT_no_enospc);
-                       if (ret)
-                               goto bkey_err;
-
-                       bch2_btree_iter_set_pos(trans, &iter, k.k->p);
-               }
-bkey_err:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-
-       if (ret < 0) {
-               bch_err_msg(ca, ret, "initializing free space");
-               return ret;
-       }
-
-       mutex_lock(&c->sb_lock);
-       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-       SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
-int bch2_fs_freespace_init(struct bch_fs *c)
-{
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
-               return 0;
-
-       /*
-        * We can crash during the device add path, so we need to check this on
-        * every mount:
-        */
-
-       bool doing_init = false;
-       for_each_member_device(c, ca) {
-               if (ca->mi.freespace_initialized)
-                       continue;
-
-               if (!doing_init) {
-                       bch_info(c, "initializing freespace");
-                       doing_init = true;
-               }
-
-               int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
-               if (ret) {
-                       bch2_dev_put(ca);
-                       bch_err_fn(c, ret);
-                       return ret;
-               }
-       }
-
-       if (doing_init) {
-               mutex_lock(&c->sb_lock);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-               bch_verbose(c, "done initializing freespace");
-       }
-
-       return 0;
-}
-
-/* device removal */
-
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bpos start       = POS(ca->dev_idx, 0);
-       struct bpos end         = POS(ca->dev_idx, U64_MAX);
-       int ret;
-
-       /*
-        * We clear the LRU and need_discard btrees first so that we don't race
-        * with bch2_do_invalidates() and bch2_do_discards()
-        */
-       ret =   bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
-                                       BTREE_TRIGGER_norun, NULL) ?:
-               bch2_dev_usage_remove(c, ca->dev_idx);
-       bch_err_msg(ca, ret, "removing dev alloc info");
-       return ret;
-}
-
-/* Bucket IO clocks: */
-
-static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-                               size_t bucket_nr, int rw)
-{
-       struct bch_fs *c = trans->c;
-
-       struct btree_iter iter;
-       struct bkey_i_alloc_v4 *a =
-               bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
-       int ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               return ret;
-
-       u64 now = bch2_current_io_time(c, rw);
-       if (a->v.io_time[rw] == now)
-               goto out;
-
-       a->v.io_time[rw] = now;
-
-       ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-                             size_t bucket_nr, int rw)
-{
-       if (bch2_trans_relock(trans))
-               bch2_trans_begin(trans);
-
-       return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
-       u64 capacity = 0, reserved_sectors = 0, gc_reserve;
-       unsigned bucket_size_max = 0;
-       unsigned long ra_pages = 0;
-
-       lockdep_assert_held(&c->state_lock);
-
-       guard(rcu)();
-       for_each_member_device_rcu(c, ca, NULL) {
-               struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
-               if (bdev)
-                       ra_pages += bdev->bd_disk->bdi->ra_pages;
-
-               if (ca->mi.state != BCH_MEMBER_STATE_rw)
-                       continue;
-
-               u64 dev_reserve = 0;
-
-               /*
-                * We need to reserve buckets (from the number
-                * of currently available buckets) against
-                * foreground writes so that mainly copygc can
-                * make forward progress.
-                *
-                * We need enough to refill the various reserves
-                * from scratch - copygc will use its entire
-                * reserve all at once, then run again once its
-                * reserve is refilled (from the formerly
-                * available buckets).
-                *
-                * This reserve is just used when considering if
-                * allocations for foreground writes must wait -
-                * not -ENOSPC calculations.
-                */
-
-               dev_reserve += ca->nr_btree_reserve * 2;
-               dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
-               dev_reserve += 1;       /* btree write point */
-               dev_reserve += 1;       /* copygc write point */
-               dev_reserve += 1;       /* rebalance write point */
-
-               dev_reserve *= ca->mi.bucket_size;
-
-               capacity += bucket_to_sector(ca, ca->mi.nbuckets -
-                                            ca->mi.first_bucket);
-
-               reserved_sectors += dev_reserve * 2;
-
-               bucket_size_max = max_t(unsigned, bucket_size_max,
-                                       ca->mi.bucket_size);
-       }
-
-       bch2_set_ra_pages(c, ra_pages);
-
-       gc_reserve = c->opts.gc_reserve_bytes
-               ? c->opts.gc_reserve_bytes >> 9
-               : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
-       reserved_sectors = max(gc_reserve, reserved_sectors);
-
-       reserved_sectors = min(reserved_sectors, capacity);
-
-       c->reserved = reserved_sectors;
-       c->capacity = capacity - reserved_sectors;
-
-       c->bucket_size_max = bucket_size_max;
-
-       /* Wake up in case someone was waiting for buckets */
-       closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
-       u64 ret = U64_MAX;
-
-       guard(rcu)();
-       for_each_rw_member_rcu(c, ca)
-               ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
-       return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct open_bucket *ob;
-
-       for (ob = c->open_buckets;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-            ob++) {
-               scoped_guard(spinlock, &ob->lock) {
-                       if (ob->valid && !ob->on_partial_list &&
-                           ob->dev == ca->dev_idx)
-                               return true;
-               }
-       }
-
-       return false;
-}
-
-void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
-{
-       /* BCH_DATA_free == all rw devs */
-
-       for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
-               if (rw &&
-                   (i == BCH_DATA_free ||
-                    (ca->mi.data_allowed & BIT(i))))
-                       set_bit(ca->dev_idx, c->rw_devs[i].d);
-               else
-                       clear_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
-       lockdep_assert_held(&c->state_lock);
-
-       /* First, remove device from allocation groups: */
-       bch2_dev_allocator_set_rw(c, ca, false);
-
-       c->rw_devs_change_count++;
-
-       /*
-        * Capacity is calculated based off of devices in allocation groups:
-        */
-       bch2_recalc_capacity(c);
-
-       bch2_open_buckets_stop(c, ca, false);
-
-       /*
-        * Wake up threads that were blocked on allocation, so they can notice
-        * the device can no longer be removed and the capacity has changed:
-        */
-       closure_wake_up(&c->freelist_wait);
-
-       /*
-        * journal_res_get() can block waiting for free space in the journal -
-        * it needs to notice there may not be devices to allocate from anymore:
-        */
-       wake_up(&c->journal.wait);
-
-       /* Now wait for any in flight writes: */
-
-       closure_wait_event(&c->open_buckets_wait,
-                          !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
-       lockdep_assert_held(&c->state_lock);
-
-       bch2_dev_allocator_set_rw(c, ca, true);
-       c->rw_devs_change_count++;
-}
-
-void bch2_dev_allocator_background_exit(struct bch_dev *ca)
-{
-       darray_exit(&ca->discard_buckets_in_flight);
-}
-
-void bch2_dev_allocator_background_init(struct bch_dev *ca)
-{
-       mutex_init(&ca->discard_buckets_in_flight_lock);
-       INIT_WORK(&ca->discard_work, bch2_do_discards_work);
-       INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
-       INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
-       spin_lock_init(&c->freelist_lock);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644 (file)
index 0cc5adc..0000000
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX      96U
-
-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
-{
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
-       return ca && bucket_valid(ca, pos.offset);
-}
-
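-/*
- * A device:bucket pos packed into a u64: device index in the high 16 bits,
- * bucket offset in the low 48:
- */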
-static inline u64 bucket_to_u64(struct bpos bucket)
-{
-       return (bucket.inode << 48) | bucket.offset;
-}
-
-static inline struct bpos u64_to_bucket(u64 bucket)
-{
-       return POS(bucket >> 48, bucket & ~(~0ULL << 48));
-}
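bucket_to_u64()/u64_to_bucket() pack a bucket position into a single word: the device index (bpos.inode) goes in the high 16 bits and the bucket offset in the low 48. A minimal standalone round-trip check of that packing, with stdint types standing in for the kernel's (the names here are illustrative, not bcachefs API):

    #include <assert.h>
    #include <stdint.h>

    struct pos { uint64_t inode, offset; };

    static uint64_t pos_to_u64(struct pos p)
    {
            return (p.inode << 48) | p.offset;
    }

    static struct pos u64_to_pos(uint64_t v)
    {
            /* the offset mask keeps the low 48 bits: ~(~0ULL << 48) */
            return (struct pos) { v >> 48, v & ~(~0ULL << 48) };
    }

    int main(void)
    {
            struct pos p = { 3, 123456 }, q = u64_to_pos(pos_to_u64(p));

            assert(p.inode == q.inode && p.offset == q.offset);
            return 0;
    }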
-
-static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
-{
-       return a.gen - a.oldest_gen;
-}
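alloc_gc_gen() relies on unsigned 8-bit wraparound: gen and oldest_gen are free-running counters, so the u8 subtraction yields the correct distance modulo 256 even after gen wraps past zero, which is why it can be compared against BUCKET_GC_GEN_MAX directly. A standalone sketch of the wraparound arithmetic:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint8_t oldest_gen = 250, gen = 4;   /* gen has wrapped past 255 */
            uint8_t distance = gen - oldest_gen; /* 4 - 250 wraps to 10 */

            assert(distance == 10);
            return 0;
    }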
-
-static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
-{
-       dst->gen                = src.gen;
-       dst->data_type          = src.data_type;
-       dst->stripe_sectors     = src.stripe_sectors;
-       dst->dirty_sectors      = src.dirty_sectors;
-       dst->cached_sectors     = src.cached_sectors;
-       dst->stripe             = src.stripe;
-}
-
-static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
-{
-       dst->gen                = src.gen;
-       dst->data_type          = src.data_type;
-       dst->stripe_sectors     = src.stripe_sectors;
-       dst->dirty_sectors      = src.dirty_sectors;
-       dst->cached_sectors     = src.cached_sectors;
-       dst->stripe             = src.stripe;
-}
-
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
-       struct bch_alloc_v4 ret = {};
-       __bucket_m_to_alloc(&ret, b);
-       return ret;
-}
-
-static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
-{
-       switch (data_type) {
-       case BCH_DATA_cached:
-       case BCH_DATA_stripe:
-               return BCH_DATA_user;
-       default:
-               return data_type;
-       }
-}
-
-static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
-                                            enum bch_data_type ptr)
-{
-       return !data_type_is_empty(bucket) &&
-               bucket_data_type(bucket) != bucket_data_type(ptr);
-}
-
-/*
- * It is my general preference to use unsigned types for unsigned quantities -
- * however, these helpers are used in disk accounting calculations run by
- * triggers where the output will be negated and added to an s64. unsigned is
- * right out even though all these quantities will fit in 32 bits, since it
- * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
- * simpler option here.
- */
-static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
-{
-       return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
-}
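The comment above describes a real sign-extension trap: negating a 32-bit unsigned value and accumulating it into an s64 zero-extends rather than sign-extends, so the "negated" delta comes out hugely positive. A minimal demonstration (userspace types, illustrative only, assuming the usual 32-bit int ABI):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t sectors = 8;
            int64_t acct = 0;

            acct += -sectors;               /* -(u32)8 is 0xfffffff8, zero-extended */
            assert(acct == 4294967288LL);   /* wrong: not -8 */

            acct = 0;
            acct += -(int64_t) sectors;     /* widen first, then negate */
            assert(acct == -8);
            return 0;
    }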
-
-static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
-{
-       return a.stripe_sectors + a.dirty_sectors;
-}
-
-static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
-{
-       return a.data_type == BCH_DATA_cached
-               ? a.cached_sectors
-               : bch2_bucket_sectors_dirty(a);
-}
-
-static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
-                                                struct bch_alloc_v4 a)
-{
-       int d = bch2_bucket_sectors(a);
-
-       return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
-{
-       int d = a.stripe_sectors + a.dirty_sectors;
-
-       return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
-{
-       return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
-}
-
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
-                                                enum bch_data_type data_type)
-{
-       if (a.stripe)
-               return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
-       if (bch2_bucket_sectors_dirty(a))
-               return bucket_data_type(data_type);
-       if (a.cached_sectors)
-               return BCH_DATA_cached;
-       if (BCH_ALLOC_V4_NEED_DISCARD(&a))
-               return BCH_DATA_need_discard;
-       if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
-               return BCH_DATA_need_gc_gens;
-       return BCH_DATA_free;
-}
-
-static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
-{
-       a->data_type = alloc_data_type(*a, data_type);
-}
-
-static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
-{
-       return a.data_type == BCH_DATA_cached
-               ? a.io_time[READ] & LRU_TIME_MAX
-               : 0;
-}
-
-#define DATA_TYPES_MOVABLE             \
-       ((1U << BCH_DATA_btree)|        \
-        (1U << BCH_DATA_user)|         \
-        (1U << BCH_DATA_stripe))
-
-static inline bool data_type_movable(enum bch_data_type type)
-{
-       return (1U << type) & DATA_TYPES_MOVABLE;
-}
-
-static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
-                                             struct bch_dev *ca)
-{
-       if (a.data_type >= BCH_DATA_NR)
-               return 0;
-
-       if (!data_type_movable(a.data_type) ||
-           !bch2_bucket_sectors_fragmented(ca, a))
-               return 0;
-
-       /*
-        * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
-        * bucket_sectors_dirty is (much) bigger than bucket_size
-        */
-       u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
-                     ca->mi.bucket_size);
-
-       return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
-}
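alloc_lru_idx_fragmentation() maps bucket fullness onto a fixed 0..2^31 scale, so buckets from devices with different bucket sizes sort comparably on a single LRU. A standalone check of the scaling arithmetic, with div_u64() replaced by plain division:

    #include <assert.h>
    #include <stdint.h>

    /* userspace stand-in for the kernel's div_u64() */
    static uint64_t div_u64(uint64_t a, uint64_t b) { return a / b; }

    int main(void)
    {
            uint64_t bucket_size = 1024;    /* sectors */
            uint64_t dirty = 256;           /* bucket is a quarter full */
            uint64_t idx = div_u64(dirty * (1ULL << 31), bucket_size);

            assert(idx == (1ULL << 31) / 4); /* 25% of the LRU range */
            return 0;
    }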
-
-static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
-{
-       return ((u64) alloc_gc_gen(a) >> 4) << 56;
-}
-
-static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
-{
-       pos.offset |= alloc_freespace_genbits(a);
-       return pos;
-}
-
-static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
-{
-       return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
-                       BCH_ALLOC_V4_U64s_V0) +
-               BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
-               (sizeof(struct bch_backpointer) / sizeof(u64));
-}
-
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
-{
-       unsigned ret = alloc_v4_u64s_noerror(a);
-       BUG_ON(ret > U8_MAX - BKEY_U64s);
-       return ret;
-}
-
-static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
-{
-       set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
-                             enum btree_iter_update_trigger_flags);
-
-void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
-
-static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
-{
-       const struct bch_alloc_v4 *ret;
-
-       if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
-               goto slowpath;
-
-       ret = bkey_s_c_to_alloc_v4(k).v;
-       if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
-               goto slowpath;
-
-       return ret;
-slowpath:
-       __bch2_alloc_to_v4(k, convert);
-       return convert;
-}
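bch2_alloc_to_v4() is a fast-path/slow-path conversion: if the key is already a current-format alloc_v4 it is used in place, with no copy; otherwise it is up-converted into a caller-supplied scratch buffer. A sketch of that pattern under simplified, assumed types (to_v4() and struct v4 here are illustrative, not the real layout):

    #include <assert.h>
    #include <stdint.h>

    struct v4 { uint32_t a, b; };

    /* caller provides scratch space; return either the key itself or scratch */
    static const struct v4 *to_v4(const void *key, int is_v4, struct v4 *scratch)
    {
            const uint32_t *old;

            if (is_v4)
                    return key;     /* fast path: current format, use in place */

            /* slow path: up-convert an older format into the scratch buffer */
            old = key;
            scratch->a = old[0];
            scratch->b = 0;         /* field that didn't exist in the old format */
            return scratch;
    }

    int main(void)
    {
            struct v4 k = { 1, 2 }, scratch;
            uint32_t old_key[1] = { 7 };
            const struct v4 *c;

            assert(to_v4(&k, 1, &scratch) == &k);

            c = to_v4(old_key, 0, &scratch);
            assert(c == &scratch && c->a == 7 && c->b == 0);
            return 0;
    }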
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-
-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-void bch2_alloc_v4_swab(struct bkey_s);
-void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_alloc ((struct bkey_ops) {       \
-       .key_validate   = bch2_alloc_v1_validate,       \
-       .val_to_text    = bch2_alloc_to_text,           \
-       .trigger        = bch2_trigger_alloc,           \
-       .min_val_size   = 8,                            \
-})
-
-#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) {    \
-       .key_validate   = bch2_alloc_v2_validate,       \
-       .val_to_text    = bch2_alloc_to_text,           \
-       .trigger        = bch2_trigger_alloc,           \
-       .min_val_size   = 8,                            \
-})
-
-#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) {    \
-       .key_validate   = bch2_alloc_v3_validate,       \
-       .val_to_text    = bch2_alloc_to_text,           \
-       .trigger        = bch2_trigger_alloc,           \
-       .min_val_size   = 16,                           \
-})
-
-#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) {    \
-       .key_validate   = bch2_alloc_v4_validate,       \
-       .val_to_text    = bch2_alloc_v4_to_text,        \
-       .swab           = bch2_alloc_v4_swab,           \
-       .trigger        = bch2_trigger_alloc,           \
-       .min_val_size   = 48,                           \
-})
-
-int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
-                             struct bkey_validate_context);
-void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
-       .key_validate   = bch2_bucket_gens_validate,    \
-       .val_to_text    = bch2_bucket_gens_to_text,     \
-})
-
-int bch2_bucket_gens_init(struct bch_fs *);
-
-static inline bool bkey_is_alloc(const struct bkey *k)
-{
-       return  k->type == KEY_TYPE_alloc ||
-               k->type == KEY_TYPE_alloc_v2 ||
-               k->type == KEY_TYPE_alloc_v3;
-}
-
-int bch2_alloc_read(struct bch_fs *);
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
-                                  const struct bch_alloc_v4 *,
-                                  const struct bch_alloc_v4 *, unsigned);
-int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
-                      struct bkey_s_c, struct bkey_s,
-                      enum btree_iter_update_trigger_flags);
-
-int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
-int bch2_check_alloc_info(struct bch_fs *);
-int bch2_check_alloc_to_lru_refs(struct bch_fs *);
-void bch2_dev_do_discards(struct bch_dev *);
-void bch2_do_discards(struct bch_fs *);
-
-static inline u64 should_invalidate_buckets(struct bch_dev *ca,
-                                           struct bch_dev_usage u)
-{
-       u64 want_free = ca->mi.nbuckets >> 7;
-       u64 free = max_t(s64, 0,
-                          u.buckets[BCH_DATA_free]
-                        + u.buckets[BCH_DATA_need_discard]
-                        - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
-
-       return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
-}
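should_invalidate_buckets() targets keeping roughly 1/128 of the device's buckets free (nbuckets >> 7), counts free plus not-yet-discarded buckets toward that target minus a reserve, and clamps the shortfall to what invalidating cached buckets could actually recover. A worked numeric check of that arithmetic (clamp_s64() is a stand-in for clamp_t, the free-space numbers are made up):

    #include <assert.h>
    #include <stdint.h>

    static int64_t clamp_s64(int64_t v, int64_t lo, int64_t hi)
    {
            return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
            uint64_t nbuckets = 1 << 20;            /* ~1M buckets */
            uint64_t want_free = nbuckets >> 7;     /* 8192: keep ~1/128 free */
            int64_t free = 1000 + 500 - 200;        /* free + need_discard - reserved */
            uint64_t cached = 5000;

            /* shortfall is 8192 - 1300 = 6892, capped by the cached buckets */
            assert(clamp_s64((int64_t) want_free - free, 0, (int64_t) cached) == 5000);
            return 0;
    }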
-
-void bch2_dev_do_invalidates(struct bch_dev *);
-void bch2_do_invalidates(struct bch_fs *);
-
-static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
-       return (void *) ((u64 *) &a->v +
-                        (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
-                         BCH_ALLOC_V4_U64s_V0));
-}
-
-static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
-{
-       return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
-}
-
-int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
-int bch2_fs_freespace_init(struct bch_fs *);
-int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
-
-void bch2_recalc_capacity(struct bch_fs *);
-u64 bch2_min_rw_member_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_dev_allocator_background_exit(struct bch_dev *);
-void bch2_dev_allocator_background_init(struct bch_dev *);
-
-void bch2_fs_allocator_background_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
deleted file mode 100644 (file)
index 7402383..0000000
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-
-struct bch_alloc {
-       struct bch_val          v;
-       __u8                    fields;
-       __u8                    gen;
-       __u8                    data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1()                  \
-       x(read_time,            16)             \
-       x(write_time,           16)             \
-       x(data_type,            8)              \
-       x(dirty_sectors,        16)             \
-       x(cached_sectors,       16)             \
-       x(oldest_gen,           8)              \
-       x(stripe,               32)             \
-       x(stripe_redundancy,    8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
-       BCH_ALLOC_FIELDS_V1()
-#undef x
-};
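BCH_ALLOC_FIELDS_V1() is an x-macro: the field list is written once, and each expansion site redefines x() to stamp out a different artifact in the same order, here the BCH_ALLOC_FIELD_V1_* enum. A minimal sketch of the pattern with a hypothetical two-entry field list:

    #include <stdio.h>

    #define FIELDS()                \
            x(read_time,  16)       \
            x(write_time, 16)

    /* expansion 1: enum of field indices */
    enum {
    #define x(name, bits) FIELD_##name,
            FIELDS()
    #undef x
            FIELD_NR
    };

    /* expansion 2: table of field widths, in the same order */
    static const unsigned field_bits[] = {
    #define x(name, bits) bits,
            FIELDS()
    #undef x
    };

    int main(void)
    {
            printf("%d fields, first is %u bits\n", FIELD_NR, field_bits[0]);
            return 0;
    }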
-
-struct bch_alloc_v2 {
-       struct bch_val          v;
-       __u8                    nr_fields;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2()                  \
-       x(read_time,            64)             \
-       x(write_time,           64)             \
-       x(dirty_sectors,        32)             \
-       x(cached_sectors,       32)             \
-       x(stripe,               32)             \
-       x(stripe_redundancy,    8)
-
-struct bch_alloc_v3 {
-       struct bch_val          v;
-       __le64                  journal_seq;
-       __le32                  flags;
-       __u8                    nr_fields;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
-
-struct bch_alloc_v4 {
-       struct bch_val          v;
-       __u64                   journal_seq_nonempty;
-       __u32                   flags;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    stripe_redundancy;
-       __u32                   dirty_sectors;
-       __u32                   cached_sectors;
-       __u64                   io_time[2];
-       __u32                   stripe;
-       __u32                   nr_external_backpointers;
-       /* end of fields in original version of alloc_v4 */
-       __u64                   journal_seq_empty;
-       __u32                   stripe_sectors;
-       __u32                   pad;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0   6
-#define BCH_ALLOC_V4_U64s      (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD,     struct bch_alloc_v4, flags,  0,  1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,     struct bch_alloc_v4, flags,  1,  2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,  struct bch_alloc_v4, flags,  8,  14)
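The BITMASK() lines above declare named sub-fields of the flags word by bit range, e.g. BACKPOINTERS_START occupies bits 2..7. A sketch of the get/set pair such a macro plausibly expands to, simplified to plain u64 operations (the real macros also generate typed accessors and, for the LE variants, handle endianness):

    #include <assert.h>
    #include <stdint.h>

    /* extract bits [lo, hi) of flags */
    static uint64_t bitmask_get(uint64_t flags, unsigned lo, unsigned hi)
    {
            return (flags >> lo) & (((uint64_t) 1 << (hi - lo)) - 1);
    }

    /* replace bits [lo, hi) of *flags with v */
    static void bitmask_set(uint64_t *flags, unsigned lo, unsigned hi, uint64_t v)
    {
            uint64_t mask = (((uint64_t) 1 << (hi - lo)) - 1) << lo;

            *flags = (*flags & ~mask) | ((v << lo) & mask);
    }

    int main(void)
    {
            uint64_t flags = 0;

            bitmask_set(&flags, 2, 8, 16);          /* BACKPOINTERS_START := 16 */
            assert(bitmask_get(flags, 2, 8) == 16);
            assert(bitmask_get(flags, 0, 1) == 0);  /* NEED_DISCARD untouched */
            return 0;
    }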
-
-#define KEY_TYPE_BUCKET_GENS_BITS      8
-#define KEY_TYPE_BUCKET_GENS_NR                (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK      (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
-       struct bch_val          v;
-       u8                      gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
deleted file mode 100644 (file)
index b58525e..0000000
+++ /dev/null
@@ -1,1683 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2012 Google, Inc.
- *
- * Foreground allocator code: allocate buckets from freelist, and allocate in
- * sector granularity from writepoints.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_write.h"
-#include "journal.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "trace.h"
-
-#include <linux/math64.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
-                                          struct mutex *lock)
-{
-       if (!mutex_trylock(lock)) {
-               bch2_trans_unlock(trans);
-               mutex_lock(lock);
-       }
-}
-
-const char * const bch2_watermarks[] = {
-#define x(t) #t,
-       BCH_WATERMARKS()
-#undef x
-       NULL
-};
-
-/*
- * Open buckets represent a bucket that's currently being allocated from.  They
- * serve two purposes:
- *
- *  - They track buckets that have been partially allocated, allowing for
- *    sub-bucket sized allocations - they're used by the sector allocator below
- *
- *  - They provide a reference to the buckets they own that mark and sweep GC
- *    can find, until the new allocation has a pointer to it inserted into the
- *    btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-void bch2_reset_alloc_cursors(struct bch_fs *c)
-{
-       guard(rcu)();
-       for_each_member_device_rcu(c, ca, NULL)
-               memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
-}
-
-static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
-{
-       open_bucket_idx_t idx = ob - c->open_buckets;
-       open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
-       ob->hash = *slot;
-       *slot = idx;
-}
-
-static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
-{
-       open_bucket_idx_t idx = ob - c->open_buckets;
-       open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
-       while (*slot != idx) {
-               BUG_ON(!*slot);
-               slot = &c->open_buckets[*slot].hash;
-       }
-
-       *slot = ob->hash;
-       ob->hash = 0;
-}
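The open-bucket hash chains entries by array index rather than by pointer: each hash slot and each open_bucket's ->hash field hold an index into c->open_buckets, with 0 terminating the chain (entry 0 is never a valid bucket). Removal walks the chain of index slots until one points at the victim, then splices it out. A standalone sketch of index-chained removal:

    #include <assert.h>
    #include <stdint.h>

    #define NR 8
    /* entry 0 is reserved so that index 0 can mean "end of chain" */
    static uint8_t chain_next[NR];  /* per-entry next index */
    static uint8_t head;            /* hash slot: index of first entry */

    static void chain_remove(uint8_t idx)
    {
            uint8_t *slot = &head;

            while (*slot != idx) {
                    assert(*slot);          /* idx must be on the chain */
                    slot = &chain_next[*slot];
            }
            *slot = chain_next[idx];
            chain_next[idx] = 0;
    }

    int main(void)
    {
            head = 3; chain_next[3] = 5; chain_next[5] = 0; /* head -> 3 -> 5 */
            chain_remove(3);
            assert(head == 5 && chain_next[5] == 0);
            return 0;
    }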
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
-       struct bch_dev *ca = ob_dev(c, ob);
-
-       if (ob->ec) {
-               ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
-               return;
-       }
-
-       spin_lock(&ob->lock);
-       ob->valid = false;
-       ob->data_type = 0;
-       spin_unlock(&ob->lock);
-
-       spin_lock(&c->freelist_lock);
-       bch2_open_bucket_hash_remove(c, ob);
-
-       ob->freelist = c->open_buckets_freelist;
-       c->open_buckets_freelist = ob - c->open_buckets;
-
-       c->open_buckets_nr_free++;
-       ca->nr_open_buckets--;
-       spin_unlock(&c->freelist_lock);
-
-       closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *c,
-                                 struct open_buckets *obs,
-                                 unsigned dev, int err)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, obs, ob, i)
-               if (ob->dev == dev && ob->ec)
-                       bch2_ec_bucket_cancel(c, ob, err);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
-       struct open_bucket *ob;
-
-       BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
-       ob = c->open_buckets + c->open_buckets_freelist;
-       c->open_buckets_freelist = ob->freelist;
-       atomic_set(&ob->pin, 1);
-       ob->data_type = 0;
-
-       c->open_buckets_nr_free--;
-       return ob;
-}
-
-static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-       if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
-               return false;
-
-       return bch2_is_superblock_bucket(ca, b);
-}
-
-static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
-{
-       BUG_ON(c->open_buckets_partial_nr >=
-              ARRAY_SIZE(c->open_buckets_partial));
-
-       spin_lock(&c->freelist_lock);
-       scoped_guard(rcu)
-               bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
-
-       ob->on_partial_list = true;
-       c->open_buckets_partial[c->open_buckets_partial_nr++] =
-               ob - c->open_buckets;
-       spin_unlock(&c->freelist_lock);
-
-       closure_wake_up(&c->open_buckets_wait);
-       closure_wake_up(&c->freelist_wait);
-}
-
-static inline bool may_alloc_bucket(struct bch_fs *c,
-                                   struct alloc_request *req,
-                                   struct bpos bucket)
-{
-       if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
-               req->counters.skipped_open++;
-               return false;
-       }
-
-       u64 journal_seq_ready =
-               bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
-                                             bucket.inode, bucket.offset);
-       if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
-               if (journal_seq_ready > c->journal.flushing_seq)
-                       req->counters.need_journal_commit++;
-               req->counters.skipped_need_journal_commit++;
-               return false;
-       }
-
-       if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
-               req->counters.skipped_nocow++;
-               return false;
-       }
-
-       return true;
-}
-
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
-                                             struct alloc_request *req,
-                                             u64 bucket, u8 gen,
-                                             struct closure *cl)
-{
-       struct bch_dev *ca = req->ca;
-
-       if (unlikely(is_superblock_bucket(c, ca, bucket)))
-               return NULL;
-
-       if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
-               req->counters.skipped_nouse++;
-               return NULL;
-       }
-
-       spin_lock(&c->freelist_lock);
-
-       if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
-               if (cl)
-                       closure_wait(&c->open_buckets_wait, cl);
-
-               track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
-               spin_unlock(&c->freelist_lock);
-               return ERR_PTR(bch_err_throw(c, open_buckets_empty));
-       }
-
-       /* Recheck under lock: */
-       if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
-               spin_unlock(&c->freelist_lock);
-               req->counters.skipped_open++;
-               return NULL;
-       }
-
-       struct open_bucket *ob = bch2_open_bucket_alloc(c);
-
-       spin_lock(&ob->lock);
-       ob->valid       = true;
-       ob->sectors_free = ca->mi.bucket_size;
-       ob->dev         = ca->dev_idx;
-       ob->gen         = gen;
-       ob->bucket      = bucket;
-       spin_unlock(&ob->lock);
-
-       ca->nr_open_buckets++;
-       bch2_open_bucket_hash_add(c, ob);
-
-       track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
-       track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
-
-       spin_unlock(&c->freelist_lock);
-       return ob;
-}
-
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
-                                           struct alloc_request *req,
-                                           struct btree_iter *freespace_iter,
-                                           struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
-
-       if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
-               return NULL;
-
-       u8 gen;
-       int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
-       if (ret < 0)
-               return ERR_PTR(ret);
-       if (ret)
-               return NULL;
-
-       return __try_alloc_bucket(c, req, b, gen, cl);
-}
-
-/*
- * This path is for before the freespace btree is initialized:
- */
-static noinline struct open_bucket *
-bch2_bucket_alloc_early(struct btree_trans *trans,
-                       struct alloc_request *req,
-                       struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_dev *ca = req->ca;
-       struct btree_iter iter, citer;
-       struct bkey_s_c k, ck;
-       struct open_bucket *ob = NULL;
-       u64 first_bucket = ca->mi.first_bucket;
-       u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
-       u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
-       u64 alloc_cursor = alloc_start;
-       int ret;
-
-       /*
-        * Scan with an uncached iterator to avoid polluting the key cache. An
-        * uncached iter will return a cached key if one exists, but if not
-        * there is no other underlying protection for the associated key cache
-        * slot. To avoid racing bucket allocations, look up the cached key slot
-        * of any likely allocation candidate before attempting to proceed with
-        * the allocation. This provides proper exclusion on the associated
-        * bucket.
-        */
-again:
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
-                          BTREE_ITER_slots, k, ret) {
-               u64 bucket = k.k->p.offset;
-
-               if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
-                       break;
-
-               if (req->btree_bitmap != BTREE_BITMAP_ANY &&
-                   req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
-                               bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
-                       if (req->btree_bitmap == BTREE_BITMAP_YES &&
-                           bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
-                               break;
-
-                       bucket = sector_to_bucket(ca,
-                                       round_up(bucket_to_sector(ca, bucket) + 1,
-                                                1ULL << ca->mi.btree_bitmap_shift));
-                       bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
-                       req->counters.buckets_seen++;
-                       req->counters.skipped_mi_btree_bitmap++;
-                       continue;
-               }
-
-               struct bch_alloc_v4 a_convert;
-               const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-               if (a->data_type != BCH_DATA_free)
-                       continue;
-
-               /* now check the cached key to serialize concurrent allocs of the bucket */
-               ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
-               ret = bkey_err(ck);
-               if (ret)
-                       break;
-
-               a = bch2_alloc_to_v4(ck, &a_convert);
-               if (a->data_type != BCH_DATA_free)
-                       goto next;
-
-               req->counters.buckets_seen++;
-
-               ob = may_alloc_bucket(c, req, k.k->p)
-                       ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
-                       : NULL;
-next:
-               bch2_set_btree_iter_dontneed(trans, &citer);
-               bch2_trans_iter_exit(trans, &citer);
-               if (ob)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       alloc_cursor = iter.pos.offset;
-
-       if (!ob && ret)
-               ob = ERR_PTR(ret);
-
-       if (!ob && alloc_start > first_bucket) {
-               alloc_cursor = alloc_start = first_bucket;
-               goto again;
-       }
-
-       *dev_alloc_cursor = alloc_cursor;
-
-       return ob;
-}
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
-                                                     struct alloc_request *req,
-                                                     struct closure *cl)
-{
-       struct bch_dev *ca = req->ca;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct open_bucket *ob = NULL;
-       u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
-       u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
-       u64 alloc_cursor = alloc_start;
-       int ret;
-again:
-       for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
-                                        POS(ca->dev_idx, alloc_cursor),
-                                        POS(ca->dev_idx, U64_MAX),
-                                        0, k, ret) {
-               /*
-                * peek normally doesn't trim extents - they can span iter.pos,
-                * which is not what we want here:
-                */
-               iter.k.size = iter.k.p.offset - iter.pos.offset;
-
-               while (iter.k.size) {
-                       req->counters.buckets_seen++;
-
-                       u64 bucket = iter.pos.offset & ~(~0ULL << 56);
-                       if (req->btree_bitmap != BTREE_BITMAP_ANY &&
-                           req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
-                                       bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
-                               if (req->btree_bitmap == BTREE_BITMAP_YES &&
-                                   bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
-                                       goto fail;
-
-                               bucket = sector_to_bucket(ca,
-                                               round_up(bucket_to_sector(ca, bucket + 1),
-                                                        1ULL << ca->mi.btree_bitmap_shift));
-                               alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
-
-                               bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
-                               req->counters.skipped_mi_btree_bitmap++;
-                               goto next;
-                       }
-
-                       ob = try_alloc_bucket(trans, req, &iter, cl);
-                       if (ob) {
-                               if (!IS_ERR(ob))
-                                       *dev_alloc_cursor = iter.pos.offset;
-                               bch2_set_btree_iter_dontneed(trans, &iter);
-                               break;
-                       }
-
-                       iter.k.size--;
-                       iter.pos.offset++;
-               }
-next:
-               if (ob || ret)
-                       break;
-       }
-fail:
-       bch2_trans_iter_exit(trans, &iter);
-
-       BUG_ON(ob && ret);
-
-       if (ret)
-               ob = ERR_PTR(ret);
-
-       if (!ob && alloc_start > ca->mi.first_bucket) {
-               alloc_cursor = alloc_start = ca->mi.first_bucket;
-               goto again;
-       }
-
-       return ob;
-}
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c,
-                                        struct alloc_request *req,
-                                        struct closure *cl,
-                                        struct open_bucket *ob)
-{
-       struct printbuf buf = PRINTBUF;
-
-       printbuf_tabstop_push(&buf, 24);
-
-       prt_printf(&buf, "dev\t%s (%u)\n",      req->ca->name, req->ca->dev_idx);
-       prt_printf(&buf, "watermark\t%s\n",     bch2_watermarks[req->watermark]);
-       prt_printf(&buf, "data type\t%s\n",     __bch2_data_types[req->data_type]);
-       prt_printf(&buf, "blocking\t%u\n",      cl != NULL);
-       prt_printf(&buf, "free\t%llu\n",        req->usage.buckets[BCH_DATA_free]);
-       prt_printf(&buf, "avail\t%llu\n",       dev_buckets_free(req->ca, req->usage, req->watermark));
-       prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
-                  bch2_copygc_wait_amount(c),
-                  c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
-       prt_printf(&buf, "seen\t%llu\n",        req->counters.buckets_seen);
-       prt_printf(&buf, "open\t%llu\n",        req->counters.skipped_open);
-       prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
-       prt_printf(&buf, "nocow\t%llu\n",       req->counters.skipped_nocow);
-       prt_printf(&buf, "nouse\t%llu\n",       req->counters.skipped_nouse);
-       prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
-
-       if (!IS_ERR(ob)) {
-               prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
-               trace_bucket_alloc(c, buf.buf);
-       } else {
-               prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
-               trace_bucket_alloc_fail(c, buf.buf);
-       }
-
-       printbuf_exit(&buf);
-}
-
-/**
- * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
- * @trans:     transaction object
- * @req:       state for the entire allocation
- * @cl:                if not NULL, closure to be used to wait if buckets not available
- * @nowait:    if true, do not wait for buckets to become available
- *
- * Returns:    an open_bucket on success, or an ERR_PTR() on failure.
- */
-static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
-                                                  struct alloc_request *req,
-                                                  struct closure *cl,
-                                                  bool nowait)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_dev *ca = req->ca;
-       struct open_bucket *ob = NULL;
-       bool freespace = READ_ONCE(ca->mi.freespace_initialized);
-       u64 avail;
-       bool waiting = nowait;
-
-       req->btree_bitmap = req->data_type == BCH_DATA_btree;
-       memset(&req->counters, 0, sizeof(req->counters));
-again:
-       bch2_dev_usage_read_fast(ca, &req->usage);
-       avail = dev_buckets_free(ca, req->usage, req->watermark);
-
-       if (req->usage.buckets[BCH_DATA_need_discard] >
-           min(avail, ca->mi.nbuckets >> 7))
-               bch2_dev_do_discards(ca);
-
-       if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
-               bch2_gc_gens_async(c);
-
-       if (should_invalidate_buckets(ca, req->usage))
-               bch2_dev_do_invalidates(ca);
-
-       if (!avail) {
-               if (req->watermark > BCH_WATERMARK_normal &&
-                   c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
-                       goto alloc;
-
-               if (cl && !waiting) {
-                       closure_wait(&c->freelist_wait, cl);
-                       waiting = true;
-                       goto again;
-               }
-
-               track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
-
-               ob = ERR_PTR(bch_err_throw(c, freelist_empty));
-               goto err;
-       }
-
-       if (waiting)
-               closure_wake_up(&c->freelist_wait);
-alloc:
-       ob = likely(freespace)
-               ? bch2_bucket_alloc_freelist(trans, req, cl)
-               : bch2_bucket_alloc_early(trans, req, cl);
-
-       if (req->counters.need_journal_commit * 2 > avail)
-               bch2_journal_flush_async(&c->journal, NULL);
-
-       if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
-               req->btree_bitmap = BTREE_BITMAP_ANY;
-               goto alloc;
-       }
-
-       if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
-               freespace = false;
-               goto alloc;
-       }
-err:
-       if (!ob)
-               ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
-
-       if (!IS_ERR(ob))
-               ob->data_type = req->data_type;
-
-       if (!IS_ERR(ob))
-               count_event(c, bucket_alloc);
-       else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
-               count_event(c, bucket_alloc_fail);
-
-       if (!IS_ERR(ob)
-           ? trace_bucket_alloc_enabled()
-           : trace_bucket_alloc_fail_enabled())
-               trace_bucket_alloc2(c, req, cl, ob);
-
-       return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-                                     enum bch_watermark watermark,
-                                     enum bch_data_type data_type,
-                                     struct closure *cl)
-{
-       struct open_bucket *ob;
-       struct alloc_request req = {
-               .watermark      = watermark,
-               .data_type      = data_type,
-               .ca             = ca,
-       };
-
-       bch2_trans_do(c,
-               PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
-       return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
-                           unsigned l, unsigned r)
-{
-       return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-void bch2_dev_alloc_list(struct bch_fs *c,
-                        struct dev_stripe_state *stripe,
-                        struct bch_devs_mask *devs,
-                        struct dev_alloc_list *ret)
-{
-       ret->nr = 0;
-
-       unsigned i;
-       for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
-               ret->data[ret->nr++] = i;
-
-       bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
-}
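bch2_dev_alloc_list() sorts at most a handful of device indices by their stripe clock hands, so a bubble sort is perfectly adequate. A standalone sketch of the same sort over an assumed next_alloc[] table of per-device hands:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t next_alloc[4] = { 30, 10, 40, 20 }; /* per-device hands */

    int main(void)
    {
            uint8_t d[4] = { 0, 1, 2, 3 };
            unsigned i, j;

            /* bubble sort, ascending by clock hand: fine for a few devices */
            for (i = 0; i < 4; i++)
                    for (j = 0; j + 1 < 4 - i; j++)
                            if (next_alloc[d[j]] > next_alloc[d[j + 1]]) {
                                    uint8_t tmp = d[j];

                                    d[j] = d[j + 1];
                                    d[j + 1] = tmp;
                            }

            /* smallest hand first: devices 1, 3, 0, 2 */
            assert(d[0] == 1 && d[1] == 3 && d[2] == 0 && d[3] == 2);
            return 0;
    }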
-
-static const u64 stripe_clock_hand_rescale     = 1ULL << 62; /* trigger rescale at */
-static const u64 stripe_clock_hand_max         = 1ULL << 56; /* max after rescale */
-static const u64 stripe_clock_hand_inv         = 1ULL << 52; /* max increment, if a device is empty */
-
-static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
-{
-       /*
-        * Avoid underflowing clock hands if at all possible: if a clock hand
-        * goes to 0 we lose information. Clock hands can span a wide range if
-        * we have devices we rarely try to allocate from, e.g. if we generally
-        * allocate from a specified target but only sometimes have to fall back
-        * to the whole filesystem.
-        */
-       u64 scale_max = U64_MAX;        /* maximum we can subtract without underflow */
-       u64 scale_min = 0;              /* minimum we must subtract to avoid overflow */
-
-       for (u64 *v = stripe->next_alloc;
-            v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
-               if (*v)
-                       scale_max = min(scale_max, *v);
-               if (*v > stripe_clock_hand_max)
-                       scale_min = max(scale_min, *v - stripe_clock_hand_max);
-       }
-
-       u64 scale = max(scale_min, scale_max);
-
-       for (u64 *v = stripe->next_alloc;
-            v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
-               *v = *v < scale ? 0 : *v - scale;
-}
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
-                              struct dev_stripe_state *stripe,
-                              struct bch_dev_usage *usage)
-{
-       /*
-        * Stripe state has a per device clock hand: we allocate from the device
-        * with the smallest clock hand.
-        *
-        * When we allocate, we don't do a simple increment; we add the inverse
-        * of the device's free space. This results in round robin behavior that
-        * biases in favor of the device(s) with more free space.
-        */
-
-       u64 *v = stripe->next_alloc + ca->dev_idx;
-       u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
-       u64 free_space_inv = free_space
-               ? div64_u64(stripe_clock_hand_inv, free_space)
-               : stripe_clock_hand_inv;
-
-       /* Saturating add, avoid overflow: */
-       u64 sum = *v + free_space_inv;
-       *v = sum >= *v ? sum : U64_MAX;
-
-       if (unlikely(*v > stripe_clock_hand_rescale))
-               bch2_stripe_state_rescale(stripe);
-}
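The clock-hand scheme above can be exercised in isolation: each allocation picks the device with the smallest hand, then bumps that hand by a constant divided by the device's free space, which yields round-robin behavior biased toward emptier devices; the saturating add mirrors the overflow guard above. A standalone simulation (the free-space numbers are made up):

    #include <assert.h>
    #include <stdint.h>

    static const uint64_t hand_inv = 1ULL << 52;

    static void bump(uint64_t *hand, uint64_t free_space)
    {
            uint64_t inc = free_space ? hand_inv / free_space : hand_inv;
            uint64_t sum = *hand + inc;

            *hand = sum >= *hand ? sum : UINT64_MAX;    /* saturate */
    }

    int main(void)
    {
            uint64_t hand[2] = { 0, 0 };
            unsigned picks[2] = { 0, 0 }, i;

            for (i = 0; i < 1000; i++) {
                    unsigned d = hand[0] <= hand[1] ? 0 : 1; /* smallest hand wins */

                    bump(&hand[d], d == 0 ? 4000 : 1000);    /* dev 0 has 4x the free space */
                    picks[d]++;
            }
            assert(picks[0] > picks[1]);    /* roughly 4:1 in dev 0's favor */
            return 0;
    }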
-
-void bch2_dev_stripe_increment(struct bch_dev *ca,
-                              struct dev_stripe_state *stripe)
-{
-       struct bch_dev_usage usage;
-
-       bch2_dev_usage_read_fast(ca, &usage);
-       bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-}
-
-static int add_new_bucket(struct bch_fs *c,
-                         struct alloc_request *req,
-                         struct open_bucket *ob)
-{
-       unsigned durability = ob_dev(c, ob)->mi.durability;
-
-       BUG_ON(req->nr_effective >= req->nr_replicas);
-
-       __clear_bit(ob->dev, req->devs_may_alloc.d);
-       req->nr_effective       += durability;
-       req->have_cache |= !durability;
-
-       ob_push(c, &req->ptrs, ob);
-
-       if (req->nr_effective >= req->nr_replicas)
-               return 1;
-       if (ob->ec)
-               return 1;
-       return 0;
-}
-
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
-                                      struct alloc_request *req,
-                                      struct dev_stripe_state *stripe,
-                                      struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       BUG_ON(req->nr_effective >= req->nr_replicas);
-
-       bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
-
-       darray_for_each(req->devs_sorted, i) {
-               req->ca = bch2_dev_tryget_noerror(c, *i);
-               if (!req->ca)
-                       continue;
-
-               if (!req->ca->mi.durability && req->have_cache) {
-                       bch2_dev_put(req->ca);
-                       continue;
-               }
-
-               struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
-                                                       req->flags & BCH_WRITE_alloc_nowait);
-               if (!IS_ERR(ob))
-                       bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
-               bch2_dev_put(req->ca);
-
-               if (IS_ERR(ob)) {
-                       ret = PTR_ERR(ob);
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
-                               break;
-                       continue;
-               }
-
-               ret = add_new_bucket(c, req, ob);
-               if (ret)
-                       break;
-       }
-
-       if (ret == 1)
-               return 0;
-       if (ret)
-               return ret;
-       return bch_err_throw(c, insufficient_devices);
-}
-
-/* Allocate from stripes: */
-
-/*
- * if we can't allocate a new stripe because there are already too many
- * partially filled stripes, force allocating from an existing stripe even when
- * it's to a device we don't want:
- */
-
-static int bucket_alloc_from_stripe(struct btree_trans *trans,
-                                   struct alloc_request *req,
-                                   struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       if (req->nr_replicas < 2)
-               return 0;
-
-       if (ec_open_bucket(c, &req->ptrs))
-               return 0;
-
-       struct ec_stripe_head *h =
-               bch2_ec_stripe_head_get(trans, req, 0, cl);
-       if (IS_ERR(h))
-               return PTR_ERR(h);
-       if (!h)
-               return 0;
-
-       bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
-
-       darray_for_each(req->devs_sorted, i)
-               for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
-                       if (!h->s->blocks[ec_idx])
-                               continue;
-
-                       struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
-                       if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
-                               ob->ec_idx      = ec_idx;
-                               ob->ec          = h->s;
-                               ec_stripe_new_get(h->s, STRIPE_REF_io);
-
-                               ret = add_new_bucket(c, req, ob);
-                               goto out;
-                       }
-               }
-out:
-       bch2_ec_stripe_head_put(c, h);
-       return ret;
-}
-
-/* Sector allocator */
-
-static bool want_bucket(struct bch_fs *c,
-                       struct alloc_request *req,
-                       struct open_bucket *ob)
-{
-       struct bch_dev *ca = ob_dev(c, ob);
-
-       if (!test_bit(ob->dev, req->devs_may_alloc.d))
-               return false;
-
-       if (ob->data_type != req->wp->data_type)
-               return false;
-
-       if (!ca->mi.durability &&
-           (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
-               return false;
-
-       if (req->ec != (ob->ec != NULL))
-               return false;
-
-       return true;
-}
-
-static int bucket_alloc_set_writepoint(struct bch_fs *c,
-                                      struct alloc_request *req)
-{
-       struct open_bucket *ob;
-       unsigned i;
-       int ret = 0;
-
-       req->scratch_ptrs.nr = 0;
-
-       open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
-               if (!ret && want_bucket(c, req, ob))
-                       ret = add_new_bucket(c, req, ob);
-               else
-                       ob_push(c, &req->scratch_ptrs, ob);
-       }
-       req->wp->ptrs = req->scratch_ptrs;
-
-       return ret;
-}
-
-static int bucket_alloc_set_partial(struct bch_fs *c,
-                                   struct alloc_request *req)
-{
-       int i, ret = 0;
-
-       if (!c->open_buckets_partial_nr)
-               return 0;
-
-       spin_lock(&c->freelist_lock);
-
-       if (!c->open_buckets_partial_nr)
-               goto unlock;
-
-       for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
-               struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
-
-               if (want_bucket(c, req, ob)) {
-                       struct bch_dev *ca = ob_dev(c, ob);
-                       u64 avail;
-
-                       bch2_dev_usage_read_fast(ca, &req->usage);
-                       avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
-                       if (!avail)
-                               continue;
-
-                       array_remove_item(c->open_buckets_partial,
-                                         c->open_buckets_partial_nr,
-                                         i);
-                       ob->on_partial_list = false;
-
-                       scoped_guard(rcu)
-                               bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
-                       ret = add_new_bucket(c, req, ob);
-                       if (ret)
-                               break;
-               }
-       }
-unlock:
-       spin_unlock(&c->freelist_lock);
-       return ret;
-}
-
-static int __open_bucket_add_buckets(struct btree_trans *trans,
-                                    struct alloc_request *req,
-                                    struct closure *_cl)
-{
-       struct bch_fs *c = trans->c;
-       struct open_bucket *ob;
-       struct closure *cl = NULL;
-       unsigned i;
-       int ret;
-
-       req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
-       /* Don't allocate from devices we already have pointers to: */
-       darray_for_each(*req->devs_have, i)
-               __clear_bit(*i, req->devs_may_alloc.d);
-
-       open_bucket_for_each(c, &req->ptrs, ob, i)
-               __clear_bit(ob->dev, req->devs_may_alloc.d);
-
-       ret = bucket_alloc_set_writepoint(c, req);
-       if (ret)
-               return ret;
-
-       ret = bucket_alloc_set_partial(c, req);
-       if (ret)
-               return ret;
-
-       if (req->ec) {
-               ret = bucket_alloc_from_stripe(trans, req, _cl);
-       } else {
-retry_blocking:
-               /*
-                * Try nonblocking first, so that if one device is full we'll try from
-                * other devices:
-                */
-               ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
-               if (ret &&
-                   !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-                   !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
-                   !cl && _cl) {
-                       cl = _cl;
-                       goto retry_blocking;
-               }
-       }
-
-       return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
-                                  struct alloc_request *req,
-                                  struct closure *cl)
-{
-       int ret;
-
-       if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
-               ret = __open_bucket_add_buckets(trans, req, cl);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-                   bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
-                   bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
-                   bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-                       return ret;
-               if (req->nr_effective >= req->nr_replicas)
-                       return 0;
-       }
-
-       bool ec = false;
-       swap(ec, req->ec);
-       ret = __open_bucket_add_buckets(trans, req, cl);
-       swap(ec, req->ec);
-
-       return ret < 0 ? ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this open_bucket should go away
- * @ob:                open_bucket to predicate on
- * @c:         filesystem handle
- * @ca:                if set, we're killing buckets for a particular device
- * @ec:                if true, we're shutting down erasure coding and killing all ec
- *             open_buckets; if neither @ca nor @ec is set, every bucket matches
- * Returns: true if we should kill this open_bucket
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
-                              struct bch_dev *ca, bool ec)
-{
-       if (ec) {
-               return ob->ec != NULL;
-       } else if (ca) {
-               bool drop = ob->dev == ca->dev_idx;
-               struct open_bucket *ob2;
-               unsigned i;
-
-               if (!drop && ob->ec) {
-                       unsigned nr_blocks;
-
-                       mutex_lock(&ob->ec->lock);
-                       nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
-                       for (i = 0; i < nr_blocks; i++) {
-                               if (!ob->ec->blocks[i])
-                                       continue;
-
-                               ob2 = c->open_buckets + ob->ec->blocks[i];
-                               drop |= ob2->dev == ca->dev_idx;
-                       }
-                       mutex_unlock(&ob->ec->lock);
-               }
-
-               return drop;
-       } else {
-               return true;
-       }
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-                                bool ec, struct write_point *wp)
-{
-       struct open_buckets ptrs = { .nr = 0 };
-       struct open_bucket *ob;
-       unsigned i;
-
-       mutex_lock(&wp->lock);
-       open_bucket_for_each(c, &wp->ptrs, ob, i)
-               if (should_drop_bucket(ob, c, ca, ec))
-                       bch2_open_bucket_put(c, ob);
-               else
-                       ob_push(c, &ptrs, ob);
-       wp->ptrs = ptrs;
-       mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
-                           bool ec)
-{
-       unsigned i;
-
-       /* Next, close write points that point to this device... */
-       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-               bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
-       bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
-       bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
-       bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
-       mutex_lock(&c->btree_reserve_cache_lock);
-       while (c->btree_reserve_cache_nr) {
-               struct btree_alloc *a =
-                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-               bch2_open_buckets_put(c, &a->ob);
-       }
-       mutex_unlock(&c->btree_reserve_cache_lock);
-
-       spin_lock(&c->freelist_lock);
-       i = 0;
-       while (i < c->open_buckets_partial_nr) {
-               struct open_bucket *ob =
-                       c->open_buckets + c->open_buckets_partial[i];
-
-               if (should_drop_bucket(ob, c, ca, ec)) {
-                       --c->open_buckets_partial_nr;
-                       swap(c->open_buckets_partial[i],
-                            c->open_buckets_partial[c->open_buckets_partial_nr]);
-
-                       ob->on_partial_list = false;
-
-                       scoped_guard(rcu)
-                               bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
-                       spin_unlock(&c->freelist_lock);
-                       bch2_open_bucket_put(c, ob);
-                       spin_lock(&c->freelist_lock);
-               } else {
-                       i++;
-               }
-       }
-       spin_unlock(&c->freelist_lock);
-
-       bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
-                                                unsigned long write_point)
-{
-       unsigned hash =
-               hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
-       return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
-                                            unsigned long write_point)
-{
-       struct write_point *wp;
-
-       guard(rcu)();
-       hlist_for_each_entry_rcu(wp, head, node)
-               if (wp->write_point == write_point)
-                       return wp;
-       return NULL;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
-       u64 stranded    = c->write_points_nr * c->bucket_size_max;
-       u64 free        = bch2_fs_usage_read_short(c).free;
-
-       return stranded * factor > free;
-}
-
-static noinline bool try_increase_writepoints(struct bch_fs *c)
-{
-       struct write_point *wp;
-
-       if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
-           too_many_writepoints(c, 32))
-               return false;
-
-       wp = c->write_points + c->write_points_nr++;
-       hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
-       return true;
-}
-
-static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
-{
-       struct bch_fs *c = trans->c;
-       struct write_point *wp;
-       struct open_bucket *ob;
-       unsigned i;
-
-       mutex_lock(&c->write_points_hash_lock);
-       if (c->write_points_nr < old_nr) {
-               mutex_unlock(&c->write_points_hash_lock);
-               return true;
-       }
-
-       if (c->write_points_nr == 1 ||
-           !too_many_writepoints(c, 8)) {
-               mutex_unlock(&c->write_points_hash_lock);
-               return false;
-       }
-
-       wp = c->write_points + --c->write_points_nr;
-
-       hlist_del_rcu(&wp->node);
-       mutex_unlock(&c->write_points_hash_lock);
-
-       bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-       open_bucket_for_each(c, &wp->ptrs, ob, i)
-               open_bucket_free_unused(c, ob);
-       wp->ptrs.nr = 0;
-       mutex_unlock(&wp->lock);
-       return true;
-}
-
-static struct write_point *writepoint_find(struct btree_trans *trans,
-                                          unsigned long write_point)
-{
-       struct bch_fs *c = trans->c;
-       struct write_point *wp, *oldest;
-       struct hlist_head *head;
-
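-       /*
-        * The low bit of a write point specifier is a tag: writepoint_ptr()
-        * passes a (word aligned) struct write_point pointer directly, while
-        * writepoint_hashed() sets the low bit and the value is looked up in
-        * the hash table below:
-        */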
-       if (!(write_point & 1UL)) {
-               wp = (struct write_point *) write_point;
-               bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-               return wp;
-       }
-
-       head = writepoint_hash(c, write_point);
-restart_find:
-       wp = __writepoint_find(head, write_point);
-       if (wp) {
-lock_wp:
-               bch2_trans_mutex_lock_norelock(trans, &wp->lock);
-               if (wp->write_point == write_point)
-                       goto out;
-               mutex_unlock(&wp->lock);
-               goto restart_find;
-       }
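-       /*
-        * No write point exists for this specifier yet: steal the least
-        * recently used one, restarting if the write point array was grown or
-        * shrunk underneath us:
-        */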
-restart_find_oldest:
-       oldest = NULL;
-       for (wp = c->write_points;
-            wp < c->write_points + c->write_points_nr; wp++)
-               if (!oldest || time_before64(wp->last_used, oldest->last_used))
-                       oldest = wp;
-
-       bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
-       bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
-       if (oldest >= c->write_points + c->write_points_nr ||
-           try_increase_writepoints(c)) {
-               mutex_unlock(&c->write_points_hash_lock);
-               mutex_unlock(&oldest->lock);
-               goto restart_find_oldest;
-       }
-
-       wp = __writepoint_find(head, write_point);
-       if (wp && wp != oldest) {
-               mutex_unlock(&c->write_points_hash_lock);
-               mutex_unlock(&oldest->lock);
-               goto lock_wp;
-       }
-
-       wp = oldest;
-       hlist_del_rcu(&wp->node);
-       wp->write_point = write_point;
-       hlist_add_head_rcu(&wp->node, head);
-       mutex_unlock(&c->write_points_hash_lock);
-out:
-       wp->last_used = local_clock();
-       return wp;
-}
-
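-/*
- * We allocated more durability than requested: hand buckets whose entire
- * durability is surplus back to the write point, keeping the rest in
- * req->ptrs:
- */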
-static noinline void
-deallocate_extra_replicas(struct bch_fs *c,
-                         struct alloc_request *req)
-{
-       struct open_bucket *ob;
-       unsigned extra_replicas = req->nr_effective - req->nr_replicas;
-       unsigned i;
-
-       req->scratch_ptrs.nr = 0;
-
-       open_bucket_for_each(c, &req->ptrs, ob, i) {
-               unsigned d = ob_dev(c, ob)->mi.durability;
-
-               if (d && d <= extra_replicas) {
-                       extra_replicas -= d;
-                       ob_push(c, &req->wp->ptrs, ob);
-               } else {
-                       ob_push(c, &req->scratch_ptrs, ob);
-               }
-       }
-
-       req->ptrs = req->scratch_ptrs;
-}
-
-/*
- * Get us a write point with open buckets we can allocate from, and return
- * with it locked:
- */
-int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
-                            unsigned target,
-                            unsigned erasure_code,
-                            struct write_point_specifier write_point,
-                            struct bch_devs_list *devs_have,
-                            unsigned nr_replicas,
-                            unsigned nr_replicas_required,
-                            enum bch_watermark watermark,
-                            enum bch_write_flags flags,
-                            struct closure *cl,
-                            struct write_point **wp_ret)
-{
-       struct bch_fs *c = trans->c;
-       struct open_bucket *ob;
-       unsigned write_points_nr;
-       int i;
-
-       struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
-       int ret = PTR_ERR_OR_ZERO(req);
-       if (unlikely(ret))
-               return ret;
-
-       if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
-               erasure_code = false;
-
-       req->nr_replicas        = nr_replicas;
-       req->target             = target;
-       req->ec                 = erasure_code;
-       req->watermark          = watermark;
-       req->flags              = flags;
-       req->devs_have          = devs_have;
-
-       BUG_ON(!nr_replicas || !nr_replicas_required);
-retry:
-       req->ptrs.nr            = 0;
-       req->nr_effective       = 0;
-       req->have_cache         = false;
-       write_points_nr         = c->write_points_nr;
-
-       *wp_ret = req->wp = writepoint_find(trans, write_point.v);
-
-       req->data_type          = req->wp->data_type;
-
-       ret = bch2_trans_relock(trans);
-       if (ret)
-               goto err;
-
-       /* metadata may not allocate on cache devices: */
-       if (req->data_type != BCH_DATA_user)
-               req->have_cache = true;
-
-       if (target && !(flags & BCH_WRITE_only_specified_devs)) {
-               ret = open_bucket_add_buckets(trans, req, NULL);
-               if (!ret ||
-                   bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto alloc_done;
-
-               /* Don't retry from all devices if we're out of open buckets: */
-               if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-                       int ret2 = open_bucket_add_buckets(trans, req, cl);
-                       if (!ret2 ||
-                           bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
-                           bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
-                               ret = ret2;
-                               goto alloc_done;
-                       }
-               }
-
-               /*
-                * Only try to allocate cache (durability = 0 devices) from the
-                * specified target:
-                */
-               req->have_cache = true;
-               req->target     = 0;
-
-               ret = open_bucket_add_buckets(trans, req, cl);
-       } else {
-               ret = open_bucket_add_buckets(trans, req, cl);
-       }
-alloc_done:
-       BUG_ON(!ret && req->nr_effective < req->nr_replicas);
-
-       if (erasure_code && !ec_open_bucket(c, &req->ptrs))
-               pr_debug("failed to get ec bucket: ret %i", ret);
-
-       if (ret == -BCH_ERR_insufficient_devices &&
-           req->nr_effective >= nr_replicas_required)
-               ret = 0;
-
-       if (ret)
-               goto err;
-
-       if (req->nr_effective > req->nr_replicas)
-               deallocate_extra_replicas(c, req);
-
-       /* Free buckets we didn't use: */
-       open_bucket_for_each(c, &req->wp->ptrs, ob, i)
-               open_bucket_free_unused(c, ob);
-
-       req->wp->ptrs = req->ptrs;
-
-       req->wp->sectors_free = UINT_MAX;
-
-       open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
-               /*
-                * Ensure proper write alignment - either due to misaligned
-                * bucket sizes (from buggy bcachefs-tools), or writes that mix
-                * logical/physical alignment:
-                */
-               struct bch_dev *ca = ob_dev(c, ob);
-               u64 offset = bucket_to_sector(ca, ob->bucket) +
-                       ca->mi.bucket_size -
-                       ob->sectors_free;
-               unsigned align = round_up(offset, block_sectors(c)) - offset;
-
-               ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
-
-               req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
-       }
-
-       req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
-
-       /* Did alignment use up space in an open_bucket? */
-       if (unlikely(!req->wp->sectors_free)) {
-               bch2_alloc_sectors_done(c, req->wp);
-               goto retry;
-       }
-
-       BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
-
-       return 0;
-err:
-       open_bucket_for_each(c, &req->wp->ptrs, ob, i)
-               if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
-                       ob_push(c, &req->ptrs, ob);
-               else
-                       open_bucket_free_unused(c, ob);
-       req->wp->ptrs = req->ptrs;
-
-       mutex_unlock(&req->wp->lock);
-
-       if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
-           try_decrease_writepoints(trans, write_points_nr))
-               goto retry;
-
-       if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-               ret = bch_err_throw(c, bucket_alloc_blocked);
-
-       if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
-           bch2_err_matches(ret, BCH_ERR_freelist_empty))
-               ret = bch_err_throw(c, bucket_alloc_blocked);
-
-       return ret;
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
-                                   struct bkey_i *k, unsigned sectors,
-                                   bool cached)
-{
-       bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-/*
- * Release @wp: unlock it, and free any open buckets now too full to allocate
- * at least one more block from
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
-       bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
-                                  enum bch_data_type type)
-{
-       mutex_init(&wp->lock);
-       wp->data_type = type;
-
-       INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
-       INIT_LIST_HEAD(&wp->writes);
-       spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
-       struct open_bucket *ob;
-       struct write_point *wp;
-
-       mutex_init(&c->write_points_hash_lock);
-       c->write_points_nr = ARRAY_SIZE(c->write_points);
-
-       /* open bucket 0 is a sentinel NULL: */
-       spin_lock_init(&c->open_buckets[0].lock);
-
-       for (ob = c->open_buckets + 1;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
-               spin_lock_init(&ob->lock);
-               c->open_buckets_nr_free++;
-
-               ob->freelist = c->open_buckets_freelist;
-               c->open_buckets_freelist = ob - c->open_buckets;
-       }
-
-       writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
-       writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
-       writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
-
-       for (wp = c->write_points;
-            wp < c->write_points + c->write_points_nr; wp++) {
-               writepoint_init(wp, BCH_DATA_user);
-
-               wp->last_used   = local_clock();
-               wp->write_point = (unsigned long) wp;
-               hlist_add_head_rcu(&wp->node,
-                                  writepoint_hash(c, wp->write_point));
-       }
-}
-
-void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
-       struct bch_dev *ca = ob_dev(c, ob);
-       unsigned data_type = ob->data_type;
-       barrier(); /* READ_ONCE() doesn't work on bitfields */
-
-       prt_printf(out, "%zu ref %u ",
-                  ob - c->open_buckets,
-                  atomic_read(&ob->pin));
-       bch2_prt_data_type(out, data_type);
-       prt_printf(out, " %u:%llu gen %u allocated %u/%u",
-                  ob->dev, ob->bucket, ob->gen,
-                  ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
-       if (ob->ec)
-               prt_printf(out, " ec idx %llu", ob->ec->idx);
-       if (ob->on_partial_list)
-               prt_str(out, " partial");
-       prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
-                              struct bch_dev *ca)
-{
-       struct open_bucket *ob;
-
-       out->atomic++;
-
-       for (ob = c->open_buckets;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-            ob++) {
-               spin_lock(&ob->lock);
-               if (ob->valid && (!ca || ob->dev == ca->dev_idx))
-                       bch2_open_bucket_to_text(out, c, ob);
-               spin_unlock(&ob->lock);
-       }
-
-       --out->atomic;
-}
-
-void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       unsigned i;
-
-       out->atomic++;
-       spin_lock(&c->freelist_lock);
-
-       for (i = 0; i < c->open_buckets_partial_nr; i++)
-               bch2_open_bucket_to_text(out, c,
-                               c->open_buckets + c->open_buckets_partial[i]);
-
-       spin_unlock(&c->freelist_lock);
-       --out->atomic;
-}
-
-static const char * const bch2_write_point_states[] = {
-#define x(n)   #n,
-       WRITE_POINT_STATES()
-#undef x
-       NULL
-};
-
-static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
-                                    struct write_point *wp)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       mutex_lock(&wp->lock);
-
-       prt_printf(out, "%lu: ", wp->write_point);
-       prt_human_readable_u64(out, wp->sectors_allocated << 9);
-
-       prt_printf(out, " last wrote: ");
-       bch2_pr_time_units(out, sched_clock() - wp->last_used);
-
-       for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
-               prt_printf(out, " %s: ", bch2_write_point_states[i]);
-               bch2_pr_time_units(out, wp->time[i]);
-       }
-
-       prt_newline(out);
-
-       printbuf_indent_add(out, 2);
-       open_bucket_for_each(c, &wp->ptrs, ob, i)
-               bch2_open_bucket_to_text(out, c, ob);
-       printbuf_indent_sub(out, 2);
-
-       mutex_unlock(&wp->lock);
-}
-
-void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct write_point *wp;
-
-       prt_str(out, "Foreground write points\n");
-       for (wp = c->write_points;
-            wp < c->write_points + ARRAY_SIZE(c->write_points);
-            wp++)
-               bch2_write_point_to_text(out, c, wp);
-
-       prt_str(out, "Copygc write point\n");
-       bch2_write_point_to_text(out, c, &c->copygc_write_point);
-
-       prt_str(out, "Rebalance write point\n");
-       bch2_write_point_to_text(out, c, &c->rebalance_write_point);
-
-       prt_str(out, "Btree write point\n");
-       bch2_write_point_to_text(out, c, &c->btree_write_point);
-}
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       unsigned nr[BCH_DATA_NR];
-
-       memset(nr, 0, sizeof(nr));
-
-       for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
-               nr[c->open_buckets[i].data_type]++;
-
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 24);
-
-       prt_printf(out, "capacity\t%llu\n",             c->capacity);
-       prt_printf(out, "reserved\t%llu\n",             c->reserved);
-       prt_printf(out, "hidden\t%llu\n",               percpu_u64_get(&c->usage->hidden));
-       prt_printf(out, "btree\t%llu\n",                percpu_u64_get(&c->usage->btree));
-       prt_printf(out, "data\t%llu\n",                 percpu_u64_get(&c->usage->data));
-       prt_printf(out, "cached\t%llu\n",               percpu_u64_get(&c->usage->cached));
-       prt_printf(out, "reserved\t%llu\n",             percpu_u64_get(&c->usage->reserved));
-       prt_printf(out, "online_reserved\t%llu\n",      percpu_u64_get(c->online_reserved));
-       prt_printf(out, "nr_inodes\t%llu\n",            percpu_u64_get(&c->usage->nr_inodes));
-
-       prt_newline(out);
-       prt_printf(out, "freelist_wait\t%s\n",                  c->freelist_wait.list.first ? "waiting" : "empty");
-       prt_printf(out, "open buckets allocated\t%i\n",         OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
-       prt_printf(out, "open buckets total\t%u\n",             OPEN_BUCKETS_COUNT);
-       prt_printf(out, "open_buckets_wait\t%s\n",              c->open_buckets_wait.list.first ? "waiting" : "empty");
-       prt_printf(out, "open_buckets_btree\t%u\n",             nr[BCH_DATA_btree]);
-       prt_printf(out, "open_buckets_user\t%u\n",              nr[BCH_DATA_user]);
-       prt_printf(out, "btree reserve cache\t%u\n",            c->btree_reserve_cache_nr);
-}
-
-void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-       struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
-       unsigned nr[BCH_DATA_NR];
-
-       memset(nr, 0, sizeof(nr));
-
-       for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
-               nr[c->open_buckets[i].data_type]++;
-
-       bch2_dev_usage_to_text(out, ca, &stats);
-
-       prt_newline(out);
-
-       prt_printf(out, "reserves:\n");
-       for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
-               prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
-
-       prt_newline(out);
-
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 12);
-       printbuf_tabstop_push(out, 16);
-
-       prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
-       prt_printf(out, "buckets to invalidate\t%llu\r\n",
-                  should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
-}
-
-static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
-{
-       struct printbuf buf = PRINTBUF;
-
-       prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
-                  c->opts.allocator_stuck_timeout);
-
-       prt_printf(&buf, "Allocator debug:\n");
-       printbuf_indent_add(&buf, 2);
-       bch2_fs_alloc_debug_to_text(&buf, c);
-       printbuf_indent_sub(&buf, 2);
-       prt_newline(&buf);
-
-       bch2_printbuf_make_room(&buf, 4096);
-
-       buf.atomic++;
-       scoped_guard(rcu)
-               for_each_online_member_rcu(c, ca) {
-                       prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
-                       printbuf_indent_add(&buf, 2);
-                       bch2_dev_alloc_debug_to_text(&buf, ca);
-                       printbuf_indent_sub(&buf, 2);
-                       prt_newline(&buf);
-               }
-       --buf.atomic;
-
-       prt_printf(&buf, "Copygc debug:\n");
-       printbuf_indent_add(&buf, 2);
-       bch2_copygc_wait_to_text(&buf, c);
-       printbuf_indent_sub(&buf, 2);
-       prt_newline(&buf);
-
-       prt_printf(&buf, "Journal debug:\n");
-       printbuf_indent_add(&buf, 2);
-       bch2_journal_debug_to_text(&buf, &c->journal);
-       printbuf_indent_sub(&buf, 2);
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-}
-
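-/*
- * Rate limit "allocator stuck" reports: for two minutes after one fires,
- * return 0 so that __bch2_wait_on_allocator() waits without re-reporting:
- */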
-static inline unsigned allocator_wait_timeout(struct bch_fs *c)
-{
-       if (c->allocator_last_stuck &&
-           time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
-               return 0;
-
-       return c->opts.allocator_stuck_timeout * HZ;
-}
-
-void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
-       unsigned t = allocator_wait_timeout(c);
-
-       if (t && closure_sync_timeout(cl, t)) {
-               c->allocator_last_stuck = jiffies;
-               bch2_print_allocator_stuck(c);
-       }
-
-       closure_sync(cl);
-}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
deleted file mode 100644 (file)
index 1b3fc84..0000000
+++ /dev/null
@@ -1,318 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
-#define _BCACHEFS_ALLOC_FOREGROUND_H
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "alloc_types.h"
-#include "extents.h"
-#include "io_write_types.h"
-#include "sb-members.h"
-
-#include <linux/hash.h>
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_list;
-
-extern const char * const bch2_watermarks[];
-
-void bch2_reset_alloc_cursors(struct bch_fs *);
-
-struct dev_alloc_list {
-       unsigned        nr;
-       u8              data[BCH_SB_MEMBERS_MAX];
-};
-
-struct alloc_request {
-       unsigned                nr_replicas;
-       unsigned                target;
-       bool                    ec;
-       enum bch_watermark      watermark;
-       enum bch_write_flags    flags;
-       enum bch_data_type      data_type;
-       struct bch_devs_list    *devs_have;
-       struct write_point      *wp;
-
-       /* These fields are used primarily by open_bucket_add_buckets */
-       struct open_buckets     ptrs;
-       unsigned                nr_effective;   /* sum of @ptrs durability */
-       bool                    have_cache;     /* have we allocated from a 0 durability dev */
-       struct bch_devs_mask    devs_may_alloc;
-
-       /* bch2_bucket_alloc_set_trans(): */
-       struct dev_alloc_list   devs_sorted;
-       struct bch_dev_usage    usage;
-
-       /* bch2_bucket_alloc_trans(): */
-       struct bch_dev          *ca;
-
-       enum {
-                               BTREE_BITMAP_NO,
-                               BTREE_BITMAP_YES,
-                               BTREE_BITMAP_ANY,
-       }                       btree_bitmap;
-
-       struct {
-               u64             buckets_seen;
-               u64             skipped_open;
-               u64             skipped_need_journal_commit;
-               u64             need_journal_commit;
-               u64             skipped_nocow;
-               u64             skipped_nouse;
-               u64             skipped_mi_btree_bitmap;
-       } counters;
-
-       unsigned                scratch_nr_replicas;
-       unsigned                scratch_nr_effective;
-       bool                    scratch_have_cache;
-       enum bch_data_type      scratch_data_type;
-       struct open_buckets     scratch_ptrs;
-       struct bch_devs_mask    scratch_devs_may_alloc;
-};
-
-void bch2_dev_alloc_list(struct bch_fs *,
-                        struct dev_stripe_state *,
-                        struct bch_devs_mask *,
-                        struct dev_alloc_list *);
-void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-
-static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
-{
-       return bch2_dev_have_ref(c, ob->dev);
-}
-
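-/*
- * Watermarks carve up the open bucket pool: the more critical the watermark,
- * the fewer buckets are held in reserve against it, so reclaim and interior
- * btree updates can still make progress when normal writes are already
- * blocked:
- */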
-static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
-{
-       switch (watermark) {
-       case BCH_WATERMARK_interior_updates:
-               return 0;
-       case BCH_WATERMARK_reclaim:
-               return OPEN_BUCKETS_COUNT / 6;
-       case BCH_WATERMARK_btree:
-       case BCH_WATERMARK_btree_copygc:
-               return OPEN_BUCKETS_COUNT / 4;
-       case BCH_WATERMARK_copygc:
-               return OPEN_BUCKETS_COUNT / 3;
-       default:
-               return OPEN_BUCKETS_COUNT / 2;
-       }
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
-                                     enum bch_watermark, enum bch_data_type,
-                                     struct closure *);
-
-static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
-                          struct open_bucket *ob)
-{
-       BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
-
-       obs->v[obs->nr++] = ob - c->open_buckets;
-}
-
-#define open_bucket_for_each(_c, _obs, _ob, _i)                                \
-       for ((_i) = 0;                                                  \
-            (_i) < (_obs)->nr &&                                       \
-            ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);        \
-            (_i)++)
-
-static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
-                                                struct open_buckets *obs)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, obs, ob, i)
-               if (ob->ec)
-                       return ob;
-
-       return NULL;
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *,
-                       struct open_buckets *, unsigned, int);
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
-       if (atomic_dec_and_test(&ob->pin))
-               __bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_buckets_put(struct bch_fs *c,
-                                        struct open_buckets *ptrs)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, ptrs, ob, i)
-               bch2_open_bucket_put(c, ob);
-       ptrs->nr = 0;
-}
-
-static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
-{
-       struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, &wp->ptrs, ob, i)
-               ob_push(c, ob->sectors_free < block_sectors(c)
-                       ? &ptrs
-                       : &keep, ob);
-       wp->ptrs = keep;
-
-       mutex_unlock(&wp->lock);
-
-       bch2_open_buckets_put(c, &ptrs);
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
-                                       struct write_point *wp,
-                                       struct open_buckets *ptrs)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               ob->data_type = wp->data_type;
-               atomic_inc(&ob->pin);
-               ob_push(c, ptrs, ob);
-       }
-}
-
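-/*
- * Hash table mapping (dev, bucket) to open_buckets, chained via ob->hash;
- * OPEN_BUCKETS_COUNT is a power of two, so the jhash result can simply be
- * masked:
- */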
-static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
-                                                 unsigned dev, u64 bucket)
-{
-       return c->open_buckets_hash +
-               (jhash_3words(dev, bucket, bucket >> 32, 0) &
-                (OPEN_BUCKETS_COUNT - 1));
-}
-
-static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
-{
-       open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
-
-       while (slot) {
-               struct open_bucket *ob = &c->open_buckets[slot];
-
-               if (ob->dev == dev && ob->bucket == bucket)
-                       return true;
-
-               slot = ob->hash;
-       }
-
-       return false;
-}
-
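-/*
- * Lockless fast path, then recheck under the freelist lock to close the race
- * against a bucket being added to the hash table:
- */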
-static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
-{
-       bool ret;
-
-       if (bch2_bucket_is_open(c, dev, bucket))
-               return true;
-
-       spin_lock(&c->freelist_lock);
-       ret = bch2_bucket_is_open(c, dev, bucket);
-       spin_unlock(&c->freelist_lock);
-
-       return ret;
-}
-
-enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
-                               struct dev_stripe_state *, struct closure *);
-
-int bch2_alloc_sectors_start_trans(struct btree_trans *,
-                                  unsigned, unsigned,
-                                  struct write_point_specifier,
-                                  struct bch_devs_list *,
-                                  unsigned, unsigned,
-                                  enum bch_watermark,
-                                  enum bch_write_flags,
-                                  struct closure *,
-                                  struct write_point **);
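-
-/*
- * Typical allocation sequence, as a minimal sketch (names here are
- * illustrative): start_trans returns with the write point locked,
- * append_ptrs adds pointers for @sectors to the key being written, and done
- * unlocks the write point:
- *
- *     struct write_point *wp;
- *     int ret = bch2_alloc_sectors_start_trans(trans, target, false,
- *                             writepoint_hashed(v), &devs_have,
- *                             nr_replicas, nr_replicas, watermark,
- *                             0, cl, &wp);
- *     if (!ret) {
- *             bch2_alloc_sectors_append_ptrs(c, wp, &new_key->k_i, sectors, false);
- *             bch2_alloc_sectors_done(c, wp);
- *     }
- */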
-
-static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
-       struct bch_dev *ca = ob_dev(c, ob);
-
-       return (struct bch_extent_ptr) {
-               .type   = 1 << BCH_EXTENT_ENTRY_ptr,
-               .gen    = ob->gen,
-               .dev    = ob->dev,
-               .offset = bucket_to_sector(ca, ob->bucket) +
-                       ca->mi.bucket_size -
-                       ob->sectors_free,
-       };
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-static inline void
-bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
-                                      struct bkey_i *k, unsigned sectors,
-                                      bool cached)
-{
-       struct open_bucket *ob;
-       unsigned i;
-
-       BUG_ON(sectors > wp->sectors_free);
-       wp->sectors_free        -= sectors;
-       wp->sectors_allocated   += sectors;
-
-       open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               struct bch_dev *ca = ob_dev(c, ob);
-               struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
-               ptr.cached = cached ||
-                       (!ca->mi.durability &&
-                        wp->data_type == BCH_DATA_user);
-
-               bch2_bkey_append_ptr(k, ptr);
-
-               BUG_ON(sectors > ob->sectors_free);
-               ob->sectors_free -= sectors;
-       }
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
-                                   struct bkey_i *, unsigned, bool);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
-
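-/*
- * A write point specifier is either a hashed value with the low bit set, or a
- * direct (word aligned, low bit clear) pointer to a struct write_point:
- */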
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
-       return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
-       return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *);
-
-void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
-void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
-void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-
-void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
-static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
-       if (cl->closure_get_happened)
-               __bch2_wait_on_allocator(c, cl);
-}
-
-#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
deleted file mode 100644 (file)
index e7becdf..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_TYPES_H
-#define _BCACHEFS_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include "clock_types.h"
-#include "fifo.h"
-
-#define BCH_WATERMARKS()               \
-       x(stripe)                       \
-       x(normal)                       \
-       x(copygc)                       \
-       x(btree)                        \
-       x(btree_copygc)                 \
-       x(reclaim)                      \
-       x(interior_updates)
-
-enum bch_watermark {
-#define x(name)        BCH_WATERMARK_##name,
-       BCH_WATERMARKS()
-#undef x
-       BCH_WATERMARK_NR,
-};
-
-#define BCH_WATERMARK_BITS     3
-#define BCH_WATERMARK_MASK     ~(~0U << BCH_WATERMARK_BITS)
-
-#define OPEN_BUCKETS_COUNT     1024
-
-#define WRITE_POINT_HASH_NR    32
-#define WRITE_POINT_MAX                32
-
-/*
- * 0 is never a valid open_bucket_idx_t:
- */
-typedef u16                    open_bucket_idx_t;
-
-struct open_bucket {
-       spinlock_t              lock;
-       atomic_t                pin;
-       open_bucket_idx_t       freelist;
-       open_bucket_idx_t       hash;
-
-       /*
-        * When an open bucket has an ec_stripe attached, this is the index of
-        * the block in the stripe this open_bucket corresponds to:
-        */
-       u8                      ec_idx;
-       enum bch_data_type      data_type:6;
-       unsigned                valid:1;
-       unsigned                on_partial_list:1;
-
-       u8                      dev;
-       u8                      gen;
-       u32                     sectors_free;
-       u64                     bucket;
-       struct ec_stripe_new    *ec;
-};
-
-#define OPEN_BUCKET_LIST_MAX   15
-
-struct open_buckets {
-       open_bucket_idx_t       nr;
-       open_bucket_idx_t       v[OPEN_BUCKET_LIST_MAX];
-};
-
-struct dev_stripe_state {
-       u64                     next_alloc[BCH_SB_MEMBERS_MAX];
-};
-
-#define WRITE_POINT_STATES()           \
-       x(stopped)                      \
-       x(waiting_io)                   \
-       x(waiting_work)                 \
-       x(runnable)                     \
-       x(running)
-
-enum write_point_state {
-#define x(n)   WRITE_POINT_##n,
-       WRITE_POINT_STATES()
-#undef x
-       WRITE_POINT_STATE_NR
-};
-
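-/*
- * Split into two cacheline aligned sections (allocation state guarded by
- * @lock, then index update state) so that updates to one side don't false
- * share with the other:
- */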
-struct write_point {
-       struct {
-               struct hlist_node       node;
-               struct mutex            lock;
-               u64                     last_used;
-               unsigned long           write_point;
-               enum bch_data_type      data_type;
-
-               /* calculated based on how many pointers we're actually going to use: */
-               unsigned                sectors_free;
-
-               struct open_buckets     ptrs;
-               struct dev_stripe_state stripe;
-
-               u64                     sectors_allocated;
-       } __aligned(SMP_CACHE_BYTES);
-
-       struct {
-               struct work_struct      index_update_work;
-
-               struct list_head        writes;
-               spinlock_t              writes_lock;
-
-               enum write_point_state  state;
-               u64                     last_state_change;
-               u64                     time[WRITE_POINT_STATE_NR];
-               u64                     last_runtime;
-       } __aligned(SMP_CACHE_BYTES);
-};
-
-struct write_point_specifier {
-       unsigned long           v;
-};
-
-#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
deleted file mode 100644 (file)
index a7cd1f0..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Async obj debugging: keep asynchronous objects on (very fast) lists, make
- * them visibile in debugfs:
- */
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "btree_io.h"
-#include "debug.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/debugfs.h>
-
-static void promote_obj_to_text(struct printbuf *out, void *obj)
-{
-       bch2_promote_op_to_text(out, obj);
-}
-
-static void rbio_obj_to_text(struct printbuf *out, void *obj)
-{
-       bch2_read_bio_to_text(out, obj);
-}
-
-static void write_op_obj_to_text(struct printbuf *out, void *obj)
-{
-       bch2_write_op_to_text(out, obj);
-}
-
-static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
-{
-       struct btree_read_bio *rbio = obj;
-       bch2_btree_read_bio_to_text(out, rbio);
-}
-
-static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
-{
-       struct btree_write_bio *wbio = obj;
-       bch2_bio_to_text(out, &wbio->wbio.bio);
-}
-
-static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
-{
-       struct async_obj_list *list = inode->i_private;
-       struct dump_iter *i;
-
-       i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-       if (!i)
-               return -ENOMEM;
-
-       file->private_data = i;
-       i->from = POS_MIN;
-       i->iter = 0;
-       i->c    = container_of(list, struct bch_fs, async_objs[list->idx]);
-       i->list = list;
-       i->buf  = PRINTBUF;
-       return 0;
-}
-
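-/*
- * Reads are resumable: i->iter holds the fast_list position of the last
- * object emitted, so consecutive read() calls pick up where the previous one
- * left off:
- */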
-static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
-                                       size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-       struct async_obj_list *list = i->list;
-       ssize_t ret = 0;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       struct genradix_iter iter;
-       void *obj;
-       fast_list_for_each_from(&list->list, iter, obj, i->iter) {
-               ret = bch2_debugfs_flush_buf(i);
-               if (ret)
-                       return ret;
-
-               if (!i->size)
-                       break;
-
-               list->obj_to_text(&i->buf, obj);
-       }
-
-       if (i->buf.allocation_failure)
-               ret = -ENOMEM;
-       else
-               i->iter = iter.pos;
-
-       if (!ret)
-               ret = bch2_debugfs_flush_buf(i);
-
-       return ret ?: i->ret;
-}
-
-static const struct file_operations async_obj_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_async_obj_list_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_async_obj_list_read,
-};
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
-{
-       c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
-
-#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir,           \
-                           &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
-       BCH_ASYNC_OBJ_LISTS()
-#undef x
-}
-
-void bch2_fs_async_obj_exit(struct bch_fs *c)
-{
-       for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
-               fast_list_exit(&c->async_objs[i].list);
-}
-
-int bch2_fs_async_obj_init(struct bch_fs *c)
-{
-       for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
-               if (fast_list_init(&c->async_objs[i].list))
-                       return -BCH_ERR_ENOMEM_async_obj_init;
-               c->async_objs[i].idx = i;
-       }
-
-#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
-       BCH_ASYNC_OBJ_LISTS()
-#undef x
-
-       return 0;
-}
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
deleted file mode 100644 (file)
index cd6489b..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_H
-#define _BCACHEFS_ASYNC_OBJS_H
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
-{
-       fast_list_remove(head, idx);
-}
-
-static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
-{
-       int ret = fast_list_add(head, obj);
-       *idx = ret > 0 ? ret : 0;
-       return ret < 0 ? ret : 0;
-}
-
-#define async_object_list_del(_c, _list, idx)          \
-       __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
-
-#define async_object_list_add(_c, _list, obj, idx)             \
-       __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
-void bch2_fs_async_obj_exit(struct bch_fs *);
-int bch2_fs_async_obj_init(struct bch_fs *);
-
-#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#define async_object_list_del(_c, _n, idx)             do {} while (0)
-
-static inline int __async_object_list_add(void)
-{
-       return 0;
-}
-#define async_object_list_add(_c, _n, obj, idx)                __async_object_list_add()
-
-static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
-static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
-static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
-
-#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#endif /* _BCACHEFS_ASYNC_OBJS_H */
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
deleted file mode 100644 (file)
index 8d713c0..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
-#define _BCACHEFS_ASYNC_OBJS_TYPES_H
-
-#define BCH_ASYNC_OBJ_LISTS()                                          \
-       x(promote)                                                      \
-       x(rbio)                                                         \
-       x(write_op)                                                     \
-       x(btree_read_bio)                                               \
-       x(btree_write_bio)
-
-enum bch_async_obj_lists {
-#define x(n)           BCH_ASYNC_OBJ_LIST_##n,
-       BCH_ASYNC_OBJ_LISTS()
-#undef x
-       BCH_ASYNC_OBJ_NR
-};
-
-struct async_obj_list {
-       struct fast_list        list;
-       void                    (*obj_to_text)(struct printbuf *, void *);
-       unsigned                idx;
-};
-
-#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
deleted file mode 100644 (file)
index 77d93be..0000000
+++ /dev/null
@@ -1,1391 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "progress.h"
-#include "recovery_passes.h"
-
-#include <linux/mm.h>
-
-static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
-
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
-       return (struct bbpos) {
-               .btree  = bp.btree_id,
-               .pos    = bp.pos,
-       };
-}
-
-int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
-                             struct bkey_validate_context from)
-{
-       struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
-                        c, backpointer_level_bad,
-                        "backpointer level bad: %u >= %u",
-                        bp.v->level, BTREE_MAX_DEPTH);
-
-       bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
-                        c, backpointer_dev_bad,
-                        "backpointer for BCH_SB_MEMBER_INVALID");
-fsck_err:
-       return ret;
-}
-
-void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
-       struct bch_dev *ca;
-       u32 bucket_offset;
-       struct bpos bucket;
-       scoped_guard(rcu) {
-               ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
-               if (ca)
-                       bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
-       }
-
-       if (ca)
-               prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
-       else
-               prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
-
-       bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
-       prt_str(out, " data_type=");
-       bch2_prt_data_type(out, bp.v->data_type);
-       prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
-                  (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
-                  bp.v->bucket_len,
-                  bp.v->bucket_gen);
-       bch2_bpos_to_text(out, bp.v->pos);
-}
-
-void bch2_backpointer_swab(struct bkey_s k)
-{
-       struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
-
-       bp.v->bucket_len        = swab32(bp.v->bucket_len);
-       bch2_bpos_swab(&bp.v->pos);
-}
-
-static bool extent_matches_bp(struct bch_fs *c,
-                             enum btree_id btree_id, unsigned level,
-                             struct bkey_s_c k,
-                             struct bkey_s_c_backpointer bp)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               struct bkey_i_backpointer bp2;
-               bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
-
-               if (bpos_eq(bp.k->p, bp2.k.p) &&
-                   !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
-                       return true;
-       }
-
-       return false;
-}
-
-static noinline int backpointer_mod_err(struct btree_trans *trans,
-                                       struct bkey_s_c orig_k,
-                                       struct bkey_i_backpointer *new_bp,
-                                       struct bkey_s_c found_bp,
-                                       bool insert)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       bool will_check = c->recovery.passes_to_run &
-               BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
-       int ret = 0;
-
-       if (insert) {
-               prt_printf(&buf, "existing backpointer found when inserting ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
-               prt_newline(&buf);
-               printbuf_indent_add(&buf, 2);
-
-               prt_printf(&buf, "found ");
-               bch2_bkey_val_to_text(&buf, c, found_bp);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "for ");
-               bch2_bkey_val_to_text(&buf, c, orig_k);
-       } else if (!will_check) {
-               prt_printf(&buf, "backpointer not found when deleting\n");
-               printbuf_indent_add(&buf, 2);
-
-               prt_printf(&buf, "searching for ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
-               prt_newline(&buf);
-
-               prt_printf(&buf, "got ");
-               bch2_bkey_val_to_text(&buf, c, found_bp);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "for ");
-               bch2_bkey_val_to_text(&buf, c, orig_k);
-       }
-
-       if (!will_check && __bch2_inconsistent_error(c, &buf))
-               ret = bch_err_throw(c, erofs_unfixed_errors);
-
-       bch_err(c, "%s", buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
-                               struct bkey_s_c orig_k,
-                               struct bkey_i_backpointer *bp,
-                               bool insert)
-{
-       struct btree_iter bp_iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
-                              bp->k.p,
-                              BTREE_ITER_intent|
-                              BTREE_ITER_slots|
-                              BTREE_ITER_with_updates);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (insert
-           ? k.k->type
-           : (k.k->type != KEY_TYPE_backpointer ||
-              memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
-               ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
-               if (ret)
-                       goto err;
-       }
-
-       if (!insert) {
-               bp->k.type = KEY_TYPE_deleted;
-               set_bkey_val_u64s(&bp->k, 0);
-       }
-
-       ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
-err:
-       bch2_trans_iter_exit(trans, &bp_iter);
-       return ret;
-}
-
-static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
-{
-       return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
-               ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
-               : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
-                bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
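-/*
- * Backpointer updates normally go through the btree write buffer, so before
- * treating a missing or mismatched backpointer as an error we must flush the
- * write buffer, in case the update simply hasn't landed yet:
- */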
-static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
-                                        struct bkey_s_c visiting_k,
-                                        struct bkey_buf *last_flushed)
-{
-       return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
-               ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
-               : 0;
-}
-
-static int backpointer_target_not_found(struct btree_trans *trans,
-                                 struct bkey_s_c_backpointer bp,
-                                 struct bkey_s_c target_k,
-                                 struct bkey_buf *last_flushed,
-                                 bool commit)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       /*
-        * If we're using the btree write buffer, the backpointer we were
-        * looking at may have already been deleted - failure to find what it
-        * pointed to is not an error:
-        */
-       ret = last_flushed
-               ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
-               : 0;
-       if (ret)
-               return ret;
-
-       prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
-                  bp.v->level ? "btree node" : "extent");
-       bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, target_k);
-
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
-               if (p.ptr.dev == bp.k->p.inode) {
-                       prt_newline(&buf);
-                       struct bkey_i_backpointer bp2;
-                       bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
-                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
-               }
-
-       if (fsck_err(trans, backpointer_to_missing_ptr,
-                    "%s", buf.buf)) {
-               ret = bch2_backpointer_del(trans, bp.k->p);
-               if (ret || !commit)
-                       goto out;
-
-               /*
-                * Normally, on transaction commit from inside a transaction,
-                * we'll return -BCH_ERR_transaction_restart_nested, since a
-                * transaction commit invalidates pointers given out by peek().
-                *
-                * However, since we're updating a write buffer btree, if we
-                * return a transaction restart and loop we won't see that the
-                * backpointer has been deleted without an additional write
-                * buffer flush - and those are expensive.
-                *
-                * So we're relying on the caller immediately advancing to the
-                * next backpointer and starting a new transaction immediately
-                * after backpointer_get_key() returns NULL:
-                */
-               ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-       }
-out:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
-                                                struct bkey_s_c_backpointer bp,
-                                                struct btree_iter *iter,
-                                                struct bkey_buf *last_flushed,
-                                                bool commit)
-{
-       struct bch_fs *c = trans->c;
-
-       BUG_ON(!bp.v->level);
-
-       bch2_trans_node_iter_init(trans, iter,
-                                 bp.v->btree_id,
-                                 bp.v->pos,
-                                 0,
-                                 bp.v->level - 1,
-                                 0);
-       struct btree *b = bch2_btree_iter_peek_node(trans, iter);
-       if (IS_ERR_OR_NULL(b))
-               goto err;
-
-       BUG_ON(b->c.level != bp.v->level - 1);
-
-       if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
-                             bkey_i_to_s_c(&b->key), bp))
-               return b;
-
-       if (btree_node_will_make_reachable(b)) {
-               b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
-       } else {
-               int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
-                                                      last_flushed, commit);
-               b = ret ? ERR_PTR(ret) : NULL;
-       }
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return b;
-}
-
-static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
-                                                 struct bkey_s_c_backpointer bp,
-                                                 struct btree_iter *iter,
-                                                 unsigned iter_flags,
-                                                 struct bkey_buf *last_flushed,
-                                                 bool commit)
-{
-       struct bch_fs *c = trans->c;
-
-       if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
-               return bkey_s_c_null;
-
-       bch2_trans_node_iter_init(trans, iter,
-                                 bp.v->btree_id,
-                                 bp.v->pos,
-                                 0,
-                                 bp.v->level,
-                                 iter_flags);
-       struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-       if (bkey_err(k)) {
-               bch2_trans_iter_exit(trans, iter);
-               return k;
-       }
-
-       /*
-        * peek_slot() doesn't normally return NULL - except when we ask for a
-        * key at a btree level that doesn't exist.
-        *
-        * We may want to revisit this and change peek_slot():
-        */
-       if (!k.k) {
-               bkey_init(&iter->k);
-               iter->k.p = bp.v->pos;
-               k.k = &iter->k;
-       }
-
-       if (k.k &&
-           extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
-               return k;
-
-       bch2_trans_iter_exit(trans, iter);
-
-       if (!bp.v->level) {
-               int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
-               return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-       } else {
-               struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
-               if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
-                       return bkey_s_c_null;
-               if (IS_ERR_OR_NULL(b))
-                       return ((struct bkey_s_c) { .k = ERR_CAST(b) });
-
-               return bkey_i_to_s_c(&b->key);
-       }
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
-                                       struct bkey_s_c_backpointer bp,
-                                       struct btree_iter *iter,
-                                       struct bkey_buf *last_flushed)
-{
-       return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
-}
-
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
-                                        struct bkey_s_c_backpointer bp,
-                                        struct btree_iter *iter,
-                                        unsigned iter_flags,
-                                        struct bkey_buf *last_flushed)
-{
-       return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
-}
-
-static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
-                                                  struct bkey_buf *last_flushed)
-{
-       if (k.k->type != KEY_TYPE_backpointer)
-               return 0;
-
-       struct bch_fs *c = trans->c;
-       struct btree_iter alloc_iter = {};
-       struct bkey_s_c alloc_k;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bpos bucket;
-       if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
-               ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
-               if (ret)
-                       goto out;
-
-               if (fsck_err(trans, backpointer_to_missing_device,
-                            "backpointer for missing device:\n%s",
-                            (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       ret = bch2_backpointer_del(trans, k.k->p);
-               goto out;
-       }
-
-       alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
-       ret = bkey_err(alloc_k);
-       if (ret)
-               goto out;
-
-       if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
-               ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
-               if (ret)
-                       goto out;
-
-               if (fsck_err(trans, backpointer_to_missing_alloc,
-                            "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
-                            alloc_iter.pos.inode, alloc_iter.pos.offset,
-                            (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       ret = bch2_backpointer_del(trans, k.k->p);
-       }
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, &alloc_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/* verify that every backpointer has a corresponding alloc key */
-int bch2_check_btree_backpointers(struct bch_fs *c)
-{
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                       BTREE_ID_backpointers, POS_MIN, 0, k,
-                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                 bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-struct extents_to_bp_state {
-       struct bpos     bp_start;
-       struct bpos     bp_end;
-       struct bkey_buf last_flushed;
-};
-
-static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
-                              struct bkey_s_c extent, unsigned dev)
-{
-       struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
-       int ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       bch2_bkey_drop_device(bkey_i_to_s(n), dev);
-       return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
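-/*
- * Two extents claim the same space on @dev: read the data and verify
- * @extent's checksum against it. Returns 1 if the checksum was bad and the
- * pointer was dropped, 0 if it matched or couldn't be checked.
- */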
-static int check_extent_checksum(struct btree_trans *trans,
-                                enum btree_id btree, struct bkey_s_c extent,
-                                enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       struct printbuf buf = PRINTBUF;
-       void *data_buf = NULL;
-       struct bio *bio = NULL;
-       size_t bytes;
-       int ret = 0;
-
-       if (bkey_is_btree_ptr(extent.k))
-               return false;
-
-       bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
-               if (p.ptr.dev == dev)
-                       goto found;
-       BUG();
-found:
-       if (!p.crc.csum_type)
-               return false;
-
-       bytes = p.crc.compressed_size << 9;
-
-       struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
-                               BCH_DEV_READ_REF_check_extent_checksums);
-       if (!ca)
-               return false;
-
-       data_buf = kvmalloc(bytes, GFP_KERNEL);
-       if (!data_buf) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
-       bio->bi_iter.bi_sector = p.ptr.offset;
-       bch2_bio_map(bio, data_buf, bytes);
-       ret = submit_bio_wait(bio);
-       if (ret)
-               goto err;
-
-       prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
-       bch2_btree_id_to_text(&buf, btree);
-       prt_str(&buf, " ");
-       bch2_bkey_val_to_text(&buf, c, extent);
-       prt_newline(&buf);
-       bch2_btree_id_to_text(&buf, o_btree);
-       prt_str(&buf, " ");
-       bch2_bkey_val_to_text(&buf, c, extent2);
-
-       struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
-       struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
-       if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
-                       trans, dup_backpointer_to_bad_csum_extent,
-                       "%s", buf.buf))
-               ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
-fsck_err:
-err:
-       if (bio)
-               bio_put(bio);
-       kvfree(data_buf);
-       enumerated_ref_put(&ca->io_ref[READ],
-                          BCH_DEV_READ_REF_check_extent_checksums);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int check_bp_exists(struct btree_trans *trans,
-                          struct extents_to_bp_state *s,
-                          struct bkey_i_backpointer *bp,
-                          struct bkey_s_c orig_k)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter other_extent_iter = {};
-       struct printbuf buf = PRINTBUF;
-
-       if (bpos_lt(bp->k.p, s->bp_start) ||
-           bpos_gt(bp->k.p, s->bp_end))
-               return 0;
-
-       struct btree_iter bp_iter;
-       struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
-       int ret = bkey_err(bp_k);
-       if (ret)
-               goto err;
-
-       if (bp_k.k->type != KEY_TYPE_backpointer ||
-           memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
-               ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
-               if (ret)
-                       goto err;
-
-               goto check_existing_bp;
-       }
-out:
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &other_extent_iter);
-       bch2_trans_iter_exit(trans, &bp_iter);
-       printbuf_exit(&buf);
-       return ret;
-check_existing_bp:
-       /* Do we have a backpointer for a different extent? */
-       if (bp_k.k->type != KEY_TYPE_backpointer)
-               goto missing;
-
-       struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
-
-       struct bkey_s_c other_extent =
-               __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
-       ret = bkey_err(other_extent);
-       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-               ret = 0;
-       if (ret)
-               goto err;
-
-       if (!other_extent.k)
-               goto missing;
-
-       rcu_read_lock();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
-       if (ca) {
-               struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
-               bkey_for_each_ptr(other_extent_ptrs, ptr)
-                       if (ptr->dev == bp->k.p.inode &&
-                           dev_ptr_stale_rcu(ca, ptr)) {
-                               rcu_read_unlock();
-                               ret = drop_dev_and_update(trans, other_bp.v->btree_id,
-                                                         other_extent, bp->k.p.inode);
-                               if (ret)
-                                       goto err;
-                               goto out;
-                       }
-       }
-       rcu_read_unlock();
-
-       if (bch2_extents_match(orig_k, other_extent)) {
-               printbuf_reset(&buf);
-               prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
-               bch2_bkey_val_to_text(&buf, c, orig_k);
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, other_extent);
-               bch_err(c, "%s", buf.buf);
-
-               if (other_extent.k->size <= orig_k.k->size) {
-                       ret = drop_dev_and_update(trans, other_bp.v->btree_id,
-                                                 other_extent, bp->k.p.inode);
-                       if (ret)
-                               goto err;
-                       goto out;
-               } else {
-                       ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
-                       if (ret)
-                               goto err;
-                       goto missing;
-               }
-       }
-
-       ret = check_extent_checksum(trans,
-                                   other_bp.v->btree_id, other_extent,
-                                   bp->v.btree_id, orig_k,
-                                   bp->k.p.inode);
-       if (ret < 0)
-               goto err;
-       if (ret) {
-               ret = 0;
-               goto missing;
-       }
-
-       ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
-                                   other_bp.v->btree_id, other_extent, bp->k.p.inode);
-       if (ret < 0)
-               goto err;
-       if (ret) {
-               ret = 0;
-               goto out;
-       }
-
-       printbuf_reset(&buf);
-       prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
-       bch2_bkey_val_to_text(&buf, c, orig_k);
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, other_extent);
-       bch_err(c, "%s", buf.buf);
-       ret = bch_err_throw(c, fsck_repair_unimplemented);
-       goto err;
-missing:
-       printbuf_reset(&buf);
-       prt_str(&buf, "missing backpointer\nfor:  ");
-       bch2_bkey_val_to_text(&buf, c, orig_k);
-       prt_printf(&buf, "\nwant: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
-       prt_printf(&buf, "\ngot:  ");
-       bch2_bkey_val_to_text(&buf, c, bp_k);
-
-       if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
-               ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
-
-       goto out;
-}
-
-static int check_extent_to_backpointers(struct btree_trans *trans,
-                                       struct extents_to_bp_state *s,
-                                       enum btree_id btree, unsigned level,
-                                       struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
-                       continue;
-
-               bool empty;
-               {
-                       /* scoped_guard() expands to a loop, which would break the continues below */
-                       guard(rcu)();
-                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
-                       if (!ca)
-                               continue;
-
-                       if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
-                               continue;
-
-                       u64 b = PTR_BUCKET_NR(ca, &p.ptr);
-                       if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
-                               continue;
-
-                       empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
-               }
-
-               struct bkey_i_backpointer bp;
-               bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
-
-               int ret = !empty
-                       ? check_bp_exists(trans, s, &bp, k)
-                       : bch2_bucket_backpointer_mod(trans, k, &bp, true);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int check_btree_root_to_backpointers(struct btree_trans *trans,
-                                           struct extents_to_bp_state *s,
-                                           enum btree_id btree_id,
-                                           int *level)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct btree *b;
-       struct bkey_s_c k;
-       int ret;
-retry:
-       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
-                                 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
-       b = bch2_btree_iter_peek_node(trans, &iter);
-       ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               goto err;
-
-       if (b != btree_node_root(c, b)) {
-               bch2_trans_iter_exit(trans, &iter);
-               goto retry;
-       }
-
-       *level = b->c.level;
-
-       k = bkey_i_to_s_c(&b->key);
-       ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
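-/*
- * fsck's budget for pinning btree nodes in the node cache: a configurable
- * percentage of total system memory.
- */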
-static u64 mem_may_pin_bytes(struct bch_fs *c)
-{
-       struct sysinfo i;
-       si_meminfo(&i);
-
-       u64 mem_bytes = i.totalram * i.mem_unit;
-       return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
-}
-
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
-{
-       return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
-}
-
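-/*
- * Pin btree nodes, starting from @start, until the memory budget is
- * exhausted; *end is set to the position we got to, so that callers can run
- * in multiple passes.
- */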
-static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
-                                       u64 btree_leaf_mask,
-                                       u64 btree_interior_mask,
-                                       struct bbpos start, struct bbpos *end)
-{
-       struct bch_fs *c = trans->c;
-       s64 mem_may_pin = mem_may_pin_bytes(c);
-       int ret = 0;
-
-       bch2_btree_cache_unpin(c);
-
-       btree_interior_mask |= btree_leaf_mask;
-
-       c->btree_cache.pinned_nodes_mask[0]             = btree_leaf_mask;
-       c->btree_cache.pinned_nodes_mask[1]             = btree_interior_mask;
-       c->btree_cache.pinned_nodes_start               = start;
-       c->btree_cache.pinned_nodes_end                 = *end = BBPOS_MAX;
-
-       for (enum btree_id btree = start.btree;
-            btree < BTREE_ID_NR && !ret;
-            btree++) {
-               unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
-
-               if (!(BIT_ULL(btree) & btree_leaf_mask) &&
-                   !(BIT_ULL(btree) & btree_interior_mask))
-                       continue;
-
-               ret = __for_each_btree_node(trans, iter, btree,
-                                     btree == start.btree ? start.pos : POS_MIN,
-                                     0, depth, BTREE_ITER_prefetch, b, ({
-                       mem_may_pin -= btree_buf_bytes(b);
-                       if (mem_may_pin <= 0) {
-                               c->btree_cache.pinned_nodes_end = *end =
-                                       BBPOS(btree, b->key.k.p);
-                               break;
-                       }
-                       bch2_node_pin(c, b);
-                       0;
-               }));
-       }
-
-       return ret;
-}
-
-static inline int bch2_fs_going_ro(struct bch_fs *c)
-{
-       return test_bit(BCH_FS_going_ro, &c->flags)
-               ? -EROFS
-               : 0;
-}
-
-static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
-                                                  struct extents_to_bp_state *s)
-{
-       struct bch_fs *c = trans->c;
-       struct progress_indicator_state progress;
-       int ret = 0;
-
-       bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
-
-       for (enum btree_id btree_id = 0;
-            btree_id < btree_id_nr_alive(c);
-            btree_id++) {
-               int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
-               ret = commit_do(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc,
-                               check_btree_root_to_backpointers(trans, s, btree_id, &level));
-               if (ret)
-                       return ret;
-
-               while (level >= depth) {
-                       struct btree_iter iter;
-                       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
-                                                 BTREE_ITER_prefetch);
-
-                       ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-                               bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
-                               bch2_fs_going_ro(c) ?:
-                               check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
-                               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-                       }));
-                       if (ret)
-                               return ret;
-
-                       --level;
-               }
-       }
-
-       return 0;
-}
-
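-/*
- * Sector counts from a bucket's backpointers are summed into one of these
- * counters, to be checked against the corresponding fields of the alloc key:
- */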
-enum alloc_sector_counter {
-       ALLOC_dirty,
-       ALLOC_cached,
-       ALLOC_stripe,
-       ALLOC_SECTORS_NR
-};
-
-static int data_type_to_alloc_counter(enum bch_data_type t)
-{
-       switch (t) {
-       case BCH_DATA_btree:
-       case BCH_DATA_user:
-               return ALLOC_dirty;
-       case BCH_DATA_cached:
-               return ALLOC_cached;
-       case BCH_DATA_stripe:
-       case BCH_DATA_parity:
-               return ALLOC_stripe;
-       default:
-               return -1;
-       }
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
-
-static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
-                                            bool *had_mismatch,
-                                            struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-       bool need_commit = false;
-
-       *had_mismatch = false;
-
-       if (a->data_type == BCH_DATA_sb ||
-           a->data_type == BCH_DATA_journal ||
-           a->data_type == BCH_DATA_parity)
-               return 0;
-
-       u32 sectors[ALLOC_SECTORS_NR];
-       memset(sectors, 0, sizeof(sectors));
-
-       struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
-       if (!ca)
-               return 0;
-
-       struct btree_iter iter;
-       struct bkey_s_c bp_k;
-       int ret = 0;
-       for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
-                               bucket_pos_to_bp_start(ca, alloc_k.k->p),
-                               bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
-               if (bp_k.k->type != KEY_TYPE_backpointer)
-                       continue;
-
-               struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-
-               if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
-                   (bp.v->bucket_gen != a->gen ||
-                    bp.v->pad)) {
-                       ret = bch2_backpointer_del(trans, bp_k.k->p);
-                       if (ret)
-                               break;
-
-                       need_commit = true;
-                       continue;
-               }
-
-               if (bp.v->bucket_gen != a->gen)
-                       continue;
-
-               int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
-               if (alloc_counter < 0)
-                       continue;
-
-               sectors[alloc_counter] += bp.v->bucket_len;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-       if (ret)
-               goto err;
-
-       if (need_commit) {
-               ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-               if (ret)
-                       goto err;
-       }
-
-       if (sectors[ALLOC_dirty]  != a->dirty_sectors ||
-           sectors[ALLOC_cached] != a->cached_sectors ||
-           sectors[ALLOC_stripe] != a->stripe_sectors) {
-               if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
-                       ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
-                       if (ret)
-                               goto err;
-               }
-
-               if (sectors[ALLOC_dirty]  > a->dirty_sectors ||
-                   sectors[ALLOC_cached] > a->cached_sectors ||
-                   sectors[ALLOC_stripe] > a->stripe_sectors) {
-                       ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
-                               bch_err_throw(c, transaction_restart_nested);
-                       goto err;
-               }
-
-               bool empty = (sectors[ALLOC_dirty] +
-                             sectors[ALLOC_stripe] +
-                             sectors[ALLOC_cached]) == 0;
-
-               ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
-                                            alloc_k.k->p.offset) ?:
-                       (empty
-                        ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
-                                                 alloc_k.k->p.offset)
-                        : 0);
-
-               *had_mismatch = true;
-       }
-err:
-       bch2_dev_put(ca);
-       return ret;
-}
-
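-/*
- * Does the range of buckets covered by this interior btree node key include
- * any buckets flagged as having backpointer mismatches?
- */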
-static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr_v2: {
-               bool ret = false;
-
-               guard(rcu)();
-               struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
-               while (pos.inode <= k.k->p.inode) {
-                       if (pos.inode >= c->sb.nr_devices)
-                               break;
-
-                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
-                       if (!ca)
-                               goto next;
-
-                       struct bpos bucket = bp_pos_to_bucket(ca, pos);
-                       u64 next = ca->mi.nbuckets;
-
-                       unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
-                       if (bitmap)
-                               next = min_t(u64, next,
-                                            find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
-
-                       bucket.offset = next;
-                       if (bucket.offset == ca->mi.nbuckets)
-                               goto next;
-
-                       ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
-                       if (ret)
-                               break;
-next:
-                       pos = SPOS(pos.inode + 1, 0, 0);
-               }
-
-               return ret;
-       }
-       case KEY_TYPE_btree_ptr:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
-                                 enum btree_id btree, unsigned level)
-{
-       struct btree_iter iter;
-       bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
-       struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-       int ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               goto err;
-
-       if (b)
-               bch2_node_pin(trans->c, b);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
-                                                  struct bpos start, struct bpos *end)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       struct bkey_buf tmp;
-       bch2_bkey_buf_init(&tmp);
-
-       bch2_btree_cache_unpin(c);
-
-       *end = SPOS_MAX;
-
-       s64 mem_may_pin = mem_may_pin_bytes(c);
-       struct btree_iter iter;
-       bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
-                                 0, 1, BTREE_ITER_prefetch);
-       ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-               if (!backpointer_node_has_missing(c, k))
-                       continue;
-
-               mem_may_pin -= c->opts.btree_node_size;
-               if (mem_may_pin <= 0)
-                       break;
-
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-               struct btree_path *path = btree_iter_path(trans, &iter);
-
-               BUG_ON(path->level != 1);
-
-               bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
-       }));
-       if (ret)
-               return ret;
-
-       struct bpos pinned = SPOS_MAX;
-       mem_may_pin = mem_may_pin_bytes(c);
-       bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
-                                 0, 1, BTREE_ITER_prefetch);
-       ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-               if (!backpointer_node_has_missing(c, k))
-                       continue;
-
-               mem_may_pin -= c->opts.btree_node_size;
-               if (mem_may_pin <= 0) {
-                       *end = pinned;
-                       break;
-               }
-
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-               struct btree_path *path = btree_iter_path(trans, &iter);
-
-               BUG_ON(path->level != 1);
-
-               int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
-
-               if (!ret2)
-                       pinned = tmp.k->k.p;
-
-               ret;
-       }));
-
-       return ret;
-}
-
-int bch2_check_extents_to_backpointers(struct bch_fs *c)
-{
-       int ret = 0;
-
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct extents_to_bp_state s = { .bp_start = POS_MIN };
-
-       bch2_bkey_buf_init(&s.last_flushed);
-       bkey_init(&s.last_flushed.k->k);
-
-       ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
-                                POS_MIN, BTREE_ITER_prefetch, k, ({
-               bool had_mismatch;
-               bch2_fs_going_ro(c) ?:
-               check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
-       }));
-       if (ret)
-               goto err;
-
-       u64 nr_buckets = 0, nr_mismatches = 0;
-       for_each_member_device(c, ca) {
-               nr_buckets      += ca->mi.nbuckets;
-               nr_mismatches   += ca->bucket_backpointer_mismatch.nr;
-       }
-
-       if (!nr_mismatches)
-               goto err;
-
-       bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
-                nr_mismatches, nr_buckets);
-
-       while (1) {
-               ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
-               if (ret)
-                       break;
-
-               if ( bpos_eq(s.bp_start, POS_MIN) &&
-                   !bpos_eq(s.bp_end, SPOS_MAX))
-                       bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
-                                   __func__, btree_nodes_fit_in_ram(c));
-
-               if (!bpos_eq(s.bp_start, POS_MIN) ||
-                   !bpos_eq(s.bp_end, SPOS_MAX)) {
-                       struct printbuf buf = PRINTBUF;
-
-                       prt_str(&buf, "check_extents_to_backpointers(): ");
-                       bch2_bpos_to_text(&buf, s.bp_start);
-                       prt_str(&buf, "-");
-                       bch2_bpos_to_text(&buf, s.bp_end);
-
-                       bch_verbose(c, "%s", buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-               ret = bch2_check_extents_to_backpointers_pass(trans, &s);
-               if (ret || bpos_eq(s.bp_end, SPOS_MAX))
-                       break;
-
-               s.bp_start = bpos_successor(s.bp_end);
-       }
-
-       for_each_member_device(c, ca) {
-               bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
-               bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
-       }
-err:
-       bch2_trans_put(trans);
-       bch2_bkey_buf_exit(&s.last_flushed, c);
-       bch2_btree_cache_unpin(c);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
-                                                struct bpos bucket,
-                                                bool *had_mismatch,
-                                                struct bkey_buf *last_flushed)
-{
-       struct btree_iter alloc_iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
-                                              BTREE_ID_alloc, bucket,
-                                              BTREE_ITER_cached);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
-       bch2_trans_iter_exit(trans, &alloc_iter);
-       return ret;
-}
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
-                                          struct bch_dev *ca, u64 bucket,
-                                          bool copygc,
-                                          struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       bool had_mismatch;
-       int ret = lockrestart_do(trans,
-               check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
-                                                     &had_mismatch, last_flushed));
-       if (ret || !had_mismatch)
-               return ret;
-
-       u64 nr = ca->bucket_backpointer_mismatch.nr;
-       u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
-
-       struct printbuf buf = PRINTBUF;
-       __bch2_log_msg_start(ca->name, &buf);
-
-       prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
-                  bucket, nr, ca->mi.nbuckets);
-
-       bch2_run_explicit_recovery_pass(c, &buf,
-                       BCH_RECOVERY_PASS_check_extents_to_backpointers,
-                       nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-       return 0;
-}
-
-/* backpointers -> extents */
-
-static int check_one_backpointer(struct btree_trans *trans,
-                                struct bbpos start,
-                                struct bbpos end,
-                                struct bkey_s_c bp_k,
-                                struct bkey_buf *last_flushed)
-{
-       if (bp_k.k->type != KEY_TYPE_backpointer)
-               return 0;
-
-       struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-       struct bbpos pos = bp_to_bbpos(*bp.v);
-
-       if (bbpos_cmp(pos, start) < 0 ||
-           bbpos_cmp(pos, end) > 0)
-               return 0;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
-       int ret = bkey_err(k);
-       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-               return 0;
-       if (ret)
-               return ret;
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
-                                               struct bch_dev *ca, struct bpos bucket)
-{
-       u32 restart_count = trans->restart_count;
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
-                                     bucket_pos_to_bp_start(ca, bucket),
-                                     bucket_pos_to_bp_end(ca, bucket),
-                                     0, k,
-               check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
-       );
-
-       bch2_bkey_buf_exit(&last_flushed, trans->c);
-       return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
-                                                  struct bbpos start,
-                                                  struct bbpos end)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_buf last_flushed;
-       struct progress_indicator_state progress;
-
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-       bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
-
-       int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
-                                    POS_MIN, BTREE_ITER_prefetch, k, ({
-                       bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
-                       check_one_backpointer(trans, start, end, k, &last_flushed);
-       }));
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       return ret;
-}
-
-int bch2_check_backpointers_to_extents(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
-       int ret;
-
-       while (1) {
-               ret = bch2_get_btree_in_memory_pos(trans,
-                                                  BIT_ULL(BTREE_ID_extents)|
-                                                  BIT_ULL(BTREE_ID_reflink),
-                                                  ~0,
-                                                  start, &end);
-               if (ret)
-                       break;
-
-               if (!bbpos_cmp(start, BBPOS_MIN) &&
-                   bbpos_cmp(end, BBPOS_MAX))
-                       bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
-                                   __func__, btree_nodes_fit_in_ram(c));
-
-               if (bbpos_cmp(start, BBPOS_MIN) ||
-                   bbpos_cmp(end, BBPOS_MAX)) {
-                       struct printbuf buf = PRINTBUF;
-
-                       prt_str(&buf, "check_backpointers_to_extents(): ");
-                       bch2_bbpos_to_text(&buf, start);
-                       prt_str(&buf, "-");
-                       bch2_bbpos_to_text(&buf, end);
-
-                       bch_verbose(c, "%s", buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-               ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
-               if (ret || !bbpos_cmp(end, BBPOS_MAX))
-                       break;
-
-               start = bbpos_successor(end);
-       }
-       bch2_trans_put(trans);
-
-       bch2_btree_cache_unpin(c);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
-{
-       scoped_guard(mutex, &b->lock) {
-               if (!b->buckets) {
-                       b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
-                                             sizeof(unsigned long), GFP_KERNEL);
-                       if (!b->buckets)
-                               return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
-               }
-
-               b->nr += !__test_and_set_bit(bit, b->buckets);
-       }
-
-       return 0;
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
-                             u64 old_size, u64 new_size)
-{
-       scoped_guard(mutex, &b->lock) {
-               if (!b->buckets)
-                       return 0;
-
-               unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
-                                           sizeof(unsigned long), GFP_KERNEL);
-               if (!n)
-                       return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
-
-               memcpy(n, b->buckets,
-                      BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
-               kvfree(b->buckets);
-               b->buckets = n;
-       }
-
-       return 0;
-}
-
-void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
-{
-       mutex_lock(&b->lock);
-       kvfree(b->buckets);
-       b->buckets = NULL;
-       b->nr   = 0;
-       mutex_unlock(&b->lock);
-}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
deleted file mode 100644 (file)
index 7e71afe..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_H
-#define _BCACHEFS_BACKPOINTERS_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "super.h"
-
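-/* Byte swap a 40-bit integer (reverses the low five bytes of @x): */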
-static inline u64 swab40(u64 x)
-{
-       return (((x & 0x00000000ffULL) << 32)|
-               ((x & 0x000000ff00ULL) << 16)|
-               ((x & 0x0000ff0000ULL) >>  0)|
-               ((x & 0x00ff000000ULL) >> 16)|
-               ((x & 0xff00000000ULL) >> 32));
-}
-
-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
-                             struct bkey_validate_context);
-void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_backpointer_swab(struct bkey_s);
-
-#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
-       .key_validate   = bch2_backpointer_validate,    \
-       .val_to_text    = bch2_backpointer_to_text,     \
-       .swab           = bch2_backpointer_swab,        \
-       .min_val_size   = 32,                           \
-})
-
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT                10
-
-/*
- * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
- * btree:
- */
-static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
-{
-       u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
-       return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
-}
-
-static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
-                                                     u32 *bucket_offset)
-{
-       u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
-       return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
-}
-
-static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
-       if (ca)
-               *bucket = bp_pos_to_bucket(ca, bp_pos);
-       return ca != NULL;
-}
-
-static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
-                                                  struct bpos bucket,
-                                                  u64 bucket_offset)
-{
-       return POS(bucket.inode,
-                  (bucket_to_sector(ca, bucket.offset) <<
-                   MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-}
-
-/*
- * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
- */
-static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
-                                          struct bpos bucket,
-                                          u64 bucket_offset)
-{
-       struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
-       EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
-       return ret;
-}
-
-static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
-{
-       return bucket_pos_to_bp(ca, bucket, 0);
-}
-
-static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
-{
-       return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
-}
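-
-/*
- * E.g., assuming 512 byte sectors and 256 sector (128k) buckets: bucket 1:10
- * starts at sector 2560, so its backpointers live at bp positions
- * 1:(2560 << MAX_EXTENT_COMPRESS_RATIO_SHIFT) through
- * 1:((2816 << MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1), inclusive.
- */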
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
-                               struct bkey_s_c,
-                               struct bkey_i_backpointer *,
-                               bool);
-
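-/*
- * Insert or delete a backpointer, normally via the btree write buffer:
- * deletion is expressed by rewriting the key as a whiteout.
- */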
-static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
-                               struct bkey_s_c orig_k,
-                               struct bkey_i_backpointer *bp,
-                               bool insert)
-{
-       if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
-               return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
-
-       if (!insert) {
-               bp->k.type = KEY_TYPE_deleted;
-               set_bkey_val_u64s(&bp->k, 0);
-       }
-
-       return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
-}
-
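-/* The data type a given pointer within @k is accounted as: */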
-static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
-                                                        struct extent_ptr_decoded p,
-                                                        const union bch_extent_entry *entry)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-               return BCH_DATA_btree;
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               if (p.has_ec)
-                       return BCH_DATA_stripe;
-               if (p.ptr.cached)
-                       return BCH_DATA_cached;
-               else
-                       return BCH_DATA_user;
-       case KEY_TYPE_stripe: {
-               const struct bch_extent_ptr *ptr = &entry->ptr;
-               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-               BUG_ON(ptr < s.v->ptrs ||
-                      ptr >= s.v->ptrs + s.v->nr_blocks);
-
-               return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
-                       ? BCH_DATA_parity
-                       : BCH_DATA_user;
-       }
-       default:
-               BUG();
-       }
-}
-
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c k, struct extent_ptr_decoded p,
-                          const union bch_extent_entry *entry,
-                          struct bkey_i_backpointer *bp)
-{
-       bkey_backpointer_init(&bp->k_i);
-       bp->k.p.inode = p.ptr.dev;
-
-       if (k.k->type != KEY_TYPE_stripe)
-               bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
-       else {
-               /*
-                * Put stripe backpointers where they won't collide with the
-                * extent backpointers within the stripe:
-                */
-               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-               bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
-                                 MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
-       }
-
-       bp->v   = (struct bch_backpointer) {
-               .btree_id       = btree_id,
-               .level          = level,
-               .data_type      = bch2_bkey_ptr_data_type(k, p, entry),
-               .bucket_gen     = p.ptr.gen,
-               .bucket_len     = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
-               .pos            = k.k->p,
-       };
-}
-
-struct bkey_buf;
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
-                                        struct btree_iter *, unsigned, struct bkey_buf *);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
-                                       struct btree_iter *, struct bkey_buf *);
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
-                                          bool, struct bkey_buf *);
-
-int bch2_check_btree_backpointers(struct bch_fs *);
-int bch2_check_extents_to_backpointers(struct bch_fs *);
-int bch2_check_backpointers_to_extents(struct bch_fs *);
-
-static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
-{
-       unsigned long *bitmap = READ_ONCE(b->buckets);
-       return bitmap && test_bit(i, bitmap);
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
-void bch2_bucket_bitmap_free(struct bucket_bitmap *);
-
-#endif /* _BCACHEFS_BACKPOINTERS_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
deleted file mode 100644 (file)
index 63abe17..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_H
-#define _BCACHEFS_BBPOS_H
-
-#include "bbpos_types.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-
-static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
-{
-       return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
-}
-
-static inline struct bbpos bbpos_successor(struct bbpos pos)
-{
-       if (bpos_cmp(pos.pos, SPOS_MAX)) {
-               pos.pos = bpos_successor(pos.pos);
-               return pos;
-       }
-
-       if (pos.btree != BTREE_ID_NR) {
-               pos.btree++;
-               pos.pos = POS_MIN;
-               return pos;
-       }
-
-       BUG();
-}
-
-static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
-{
-       bch2_btree_id_to_text(out, pos.btree);
-       prt_char(out, ':');
-       bch2_bpos_to_text(out, pos.pos);
-}
-
-#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
deleted file mode 100644 (file)
index f638933..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_TYPES_H
-#define _BCACHEFS_BBPOS_TYPES_H
-
-struct bbpos {
-       enum btree_id           btree;
-       struct bpos             pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
-       return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN      BBPOS(0, POS_MIN)
-#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
-
-#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
deleted file mode 100644 (file)
index ddfacad..0000000
+++ /dev/null
@@ -1,1295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_H
-#define _BCACHEFS_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices are sort
- * of like an md raid array and its component devices. Most of the code doesn't
- * care about individual cache devices; the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
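- *
- * As an illustrative sketch (hypothetical names, not the real structures):
- *
- *   struct bucket { u8 gen; };  // part of the per-bucket alloc info
- *   struct ptr    { u8 gen; };  // embedded in each pointer into the bucket
- *
- *   bool ptr_valid(struct bucket *b, struct ptr *p)
- *   {
- *           return p->gen == b->gen; // stale pointers simply don't match
- *   }
- *
- * so reusing a bucket is just b->gen++, which invalidates every old pointer
- * into it at once.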
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
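- *
- * Conceptually, a lookup within a node is then (sketch only):
- *
- *   for (each sorted set s in the node, newest to oldest)
- *           result = merge(result, binary_search(s, search_key));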
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
-#undef pr_fmt
-#ifdef __KERNEL__
-#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#else
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define ENUMERATED_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...)             0
-#endif
-
-#define race_fault(...)                        dynamic_fault("bcachefs:race")
-
-#include <linux/backing-dev-defs.h>
-#include <linux/bug.h>
-#include <linux/bio.h>
-#include <linux/closure.h>
-#include <linux/kobject.h>
-#include <linux/list.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/refcount.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/srcu.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/zstd.h>
-#include <linux/unicode.h>
-
-#include "bcachefs_format.h"
-#include "btree_journal_iter_types.h"
-#include "disk_accounting_types.h"
-#include "errcode.h"
-#include "fast_list.h"
-#include "fifo.h"
-#include "nocow_locking_types.h"
-#include "opts.h"
-#include "sb-errors_types.h"
-#include "seqmutex.h"
-#include "snapshot_types.h"
-#include "time_stats.h"
-#include "util.h"
-
-#include "alloc_types.h"
-#include "async_objs_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "enumerated_ref_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "recovery_passes_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
-#include "trace.h"
-
-#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
-
-#define trace_and_count(_c, _name, ...)                                        \
-do {                                                                   \
-       count_event(_c, _name);                                         \
-       trace_##_name(__VA_ARGS__);                                     \
-} while (0)
-
-#define bch2_fs_init_fault(name)                                       \
-       dynamic_fault("bcachefs:bch_fs_init:" name)
-#define bch2_meta_read_fault(name)                                     \
-        dynamic_fault("bcachefs:meta:read:" name)
-#define bch2_meta_write_fault(name)                                    \
-        dynamic_fault("bcachefs:meta:write:" name)
-
-#ifdef __KERNEL__
-#define BCACHEFS_LOG_PREFIX
-#endif
-
-#ifdef BCACHEFS_LOG_PREFIX
-
-#define bch2_log_msg(_c, fmt)                  "bcachefs (%s): " fmt, ((_c)->name)
-#define bch2_fmt_dev(_ca, fmt)                 "bcachefs (%s): " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt)          "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)                  \
-        "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
-
-#else
-
-#define bch2_log_msg(_c, fmt)                  fmt
-#define bch2_fmt_dev(_ca, fmt)                 "%s: " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt)          "inum %llu: " fmt "\n", (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)                          \
-        "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
-
-#endif
-
-#define bch2_fmt(_c, fmt)              bch2_log_msg(_c, fmt "\n")
-
-void bch2_print_str(struct bch_fs *, const char *, const char *);
-
-__printf(2, 3)
-void bch2_print_opts(struct bch_opts *, const char *, ...);
-
-__printf(2, 3)
-void __bch2_print(struct bch_fs *c, const char *fmt, ...);
-
-#define maybe_dev_to_fs(_c)    _Generic((_c),                          \
-       struct bch_dev *:       ((struct bch_dev *) (_c))->fs,          \
-       struct bch_fs *:        (_c))
-
-#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
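
Thanks to the _Generic dispatch above, bch2_print() accepts either pointer type; a device is mapped to its owning filesystem via ->fs. A sketch of both call forms (the function name is illustrative):

static void print_example(struct bch_fs *c, struct bch_dev *ca)
{
        bch2_print(c,  "fs-level message\n");   /* maybe_dev_to_fs(c)  == c      */
        bch2_print(ca, "dev-level message\n");  /* maybe_dev_to_fs(ca) == ca->fs */
}
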
-
-#define bch2_print_ratelimited(_c, ...)                                        \
-do {                                                                   \
-       static DEFINE_RATELIMIT_STATE(_rs,                              \
-                                     DEFAULT_RATELIMIT_INTERVAL,       \
-                                     DEFAULT_RATELIMIT_BURST);         \
-                                                                       \
-       if (__ratelimit(&_rs))                                          \
-               bch2_print(_c, __VA_ARGS__);                            \
-} while (0)
-
-#define bch2_print_str_ratelimited(_c, ...)                            \
-do {                                                                   \
-       static DEFINE_RATELIMIT_STATE(_rs,                              \
-                                     DEFAULT_RATELIMIT_INTERVAL,       \
-                                     DEFAULT_RATELIMIT_BURST);         \
-                                                                       \
-       if (__ratelimit(&_rs))                                          \
-               bch2_print_str(_c, __VA_ARGS__);                        \
-} while (0)
-
-#define bch_info(c, fmt, ...) \
-       bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_info_ratelimited(c, fmt, ...) \
-       bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
-       bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
-       bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn_ratelimited(c, fmt, ...) \
-       bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_err(c, fmt, ...) \
-       bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...) \
-       bch2_print(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset(ca, _offset, fmt, ...) \
-       bch2_print(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum(c, _inum, fmt, ...) \
-       bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
-       bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-#define bch_err_ratelimited(c, fmt, ...) \
-       bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev_ratelimited(ca, fmt, ...) \
-       bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
-       bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
-       bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
-       bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-static inline bool should_print_err(int err)
-{
-       return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
-}
-
-#define bch_err_fn(_c, _ret)                                           \
-do {                                                                   \
-       if (should_print_err(_ret))                                     \
-               bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_fn_ratelimited(_c, _ret)                               \
-do {                                                                   \
-       if (should_print_err(_ret))                                     \
-               bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_msg(_c, _ret, _msg, ...)                               \
-do {                                                                   \
-       if (should_print_err(_ret))                                     \
-               bch_err(_c, "%s(): error " _msg " %s", __func__,        \
-                       ##__VA_ARGS__, bch2_err_str(_ret));             \
-} while (0)
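
A sketch of the intended call-site pattern (frob_example() is hypothetical): transaction restarts stay silent, while real errors are logged with the calling function's name prepended.

static int frob_example(struct bch_fs *c)
{
        int ret = -EIO;                 /* stand-in for a real operation */

        /* Prints "frob_example(): error frobbing <err>" -- unless the
         * error is a transaction restart, which should_print_err() hides: */
        bch_err_msg(c, ret, "frobbing");
        return ret;
}
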
-
-#define bch_verbose(c, fmt, ...)                                       \
-do {                                                                   \
-       if ((c)->opts.verbose)                                          \
-               bch_info(c, fmt, ##__VA_ARGS__);                        \
-} while (0)
-
-#define bch_verbose_ratelimited(c, fmt, ...)                           \
-do {                                                                   \
-       if ((c)->opts.verbose)                                          \
-               bch_info_ratelimited(c, fmt, ##__VA_ARGS__);            \
-} while (0)
-
-#define pr_verbose_init(opts, fmt, ...)                                        \
-do {                                                                   \
-       if (opt_get(opts, verbose))                                     \
-               pr_info(fmt, ##__VA_ARGS__);                            \
-} while (0)
-
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
-       trace_error_throw(c, err, _THIS_IP_);
-       return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS()                                      \
-       BCH_DEBUG_PARAM(key_merging_disabled,                           \
-               "Disables merging of extents")                          \
-       BCH_DEBUG_PARAM(btree_node_merging_disabled,                    \
-               "Disables merging of btree nodes")                      \
-       BCH_DEBUG_PARAM(btree_gc_always_rewrite,                        \
-               "Causes mark and sweep to compact and rewrite every "   \
-               "btree node it traverses")                              \
-       BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,                      \
-               "Disables rewriting of btree nodes during mark and sweep")\
-       BCH_DEBUG_PARAM(btree_shrinker_disabled,                        \
-               "Disables the shrinker callback for the btree node cache")\
-       BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
-               "Reread btree nodes at various points to verify the "   \
-               "mergesort in the read path against modifications "     \
-               "done in memory")                                       \
-       BCH_DEBUG_PARAM(verify_all_btree_replicas,                      \
-               "When reading btree nodes, read all replicas and "      \
-               "compare them")                                         \
-       BCH_DEBUG_PARAM(backpointers_no_use_write_buffer,               \
-               "Don't use the write buffer for backpointers, enabling "\
-               "extra runtime checks")                                 \
-       BCH_DEBUG_PARAM(debug_check_btree_locking,                      \
-               "Enable additional asserts for btree locking")          \
-       BCH_DEBUG_PARAM(debug_check_iterators,                          \
-               "Enables extra verification for btree iterators")       \
-       BCH_DEBUG_PARAM(debug_check_bset_lookups,                       \
-               "Enables extra verification for bset lookups")          \
-       BCH_DEBUG_PARAM(debug_check_btree_accounting,                   \
-               "Verify btree accounting for keys within a node")       \
-       BCH_DEBUG_PARAM(debug_check_bkey_unpack,                        \
-               "Enables extra verification for bkey unpack")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG()                                       \
-       BCH_DEBUG_PARAM(journal_seq_verify,                             \
-               "Store the journal sequence number in the version "     \
-               "number of every btree key, and verify that btree "     \
-               "update ordering is preserved during recovery")         \
-       BCH_DEBUG_PARAM(inject_invalid_keys,                            \
-               "Inject invalid version numbers into newly inserted "   \
-               "btree keys, for testing handling of invalid keys")     \
-       BCH_DEBUG_PARAM(test_alloc_startup,                             \
-               "Force allocator startup to use the slowpath where it " \
-               "can't find enough free buckets without invalidating "  \
-               "cached data")                                          \
-       BCH_DEBUG_PARAM(force_reconstruct_read,                         \
-               "Force reads to use the reconstruct path, when reading "\
-               "from erasure coded extents")                           \
-       BCH_DEBUG_PARAM(test_restart_gc,                                \
-               "Test restarting mark and sweep gc when bucket gens change")
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
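
For one entry, the instantiation above expands to an extern declaration; the matching definitions would be generated the same way elsewhere. A sketch of both halves of the x-macro pattern (the .c-file half is an assumption about how the declarations are paired, not a line from this file):

        /* What the block above generates for the first parameter: */
        extern struct static_key_false bch2_key_merging_disabled;

        /* Sketch of the matching definitions in a .c file: */
        #define BCH_DEBUG_PARAM(name, description) \
                DEFINE_STATIC_KEY_FALSE(bch2_##name);
        BCH_DEBUG_PARAMS_ALL()
        #undef BCH_DEBUG_PARAM
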
-
-#define BCH_TIME_STATS()                       \
-       x(btree_node_mem_alloc)                 \
-       x(btree_node_split)                     \
-       x(btree_node_compact)                   \
-       x(btree_node_merge)                     \
-       x(btree_node_sort)                      \
-       x(btree_node_get)                       \
-       x(btree_node_read)                      \
-       x(btree_node_read_done)                 \
-       x(btree_node_write)                     \
-       x(btree_interior_update_foreground)     \
-       x(btree_interior_update_total)          \
-       x(btree_gc)                             \
-       x(data_write)                           \
-       x(data_write_to_submit)                 \
-       x(data_write_to_queue)                  \
-       x(data_write_to_btree_update)           \
-       x(data_write_btree_update)              \
-       x(data_read)                            \
-       x(data_promote)                         \
-       x(journal_flush_write)                  \
-       x(journal_noflush_write)                \
-       x(journal_flush_seq)                    \
-       x(blocked_journal_low_on_space)         \
-       x(blocked_journal_low_on_pin)           \
-       x(blocked_journal_max_in_flight)        \
-       x(blocked_journal_max_open)             \
-       x(blocked_key_cache_flush)              \
-       x(blocked_allocate)                     \
-       x(blocked_allocate_open_bucket)         \
-       x(blocked_write_buffer_full)            \
-       x(nocow_lock_contended)
-
-enum bch_time_stats {
-#define x(name) BCH_TIME_##name,
-       BCH_TIME_STATS()
-#undef x
-       BCH_TIME_STAT_NR
-};
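
Expanded by hand, the x-macro yields one enumerator per entry plus a trailing count, so c->times[] can be indexed by name:

enum bch_time_stats {
        BCH_TIME_btree_node_mem_alloc,          /* == 0 */
        BCH_TIME_btree_node_split,
        /* ...one enumerator per x() line above... */
        BCH_TIME_nocow_lock_contended,
        BCH_TIME_STAT_NR,                       /* count; sizes the array */
};
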
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES         4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX      (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE     (BTREE_RESERVE_MAX * 4)
-
-#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-
-struct btree;
-
-struct io_count {
-       u64                     sectors[2][BCH_DATA_NR];
-};
-
-struct discard_in_flight {
-       bool                    in_progress:1;
-       u64                     bucket:63;
-};
-
-#define BCH_DEV_READ_REFS()                            \
-       x(bch2_online_devs)                             \
-       x(trans_mark_dev_sbs)                           \
-       x(read_fua_test)                                \
-       x(sb_field_resize)                              \
-       x(write_super)                                  \
-       x(journal_read)                                 \
-       x(fs_journal_alloc)                             \
-       x(fs_resize_on_mount)                           \
-       x(btree_node_read)                              \
-       x(btree_node_read_all_replicas)                 \
-       x(btree_node_scrub)                             \
-       x(btree_node_write)                             \
-       x(btree_node_scan)                              \
-       x(btree_verify_replicas)                        \
-       x(btree_node_ondisk_to_text)                    \
-       x(io_read)                                      \
-       x(check_extent_checksums)                       \
-       x(ec_block)
-
-enum bch_dev_read_ref {
-#define x(n) BCH_DEV_READ_REF_##n,
-       BCH_DEV_READ_REFS()
-#undef x
-       BCH_DEV_READ_REF_NR,
-};
-
-#define BCH_DEV_WRITE_REFS()                           \
-       x(journal_write)                                \
-       x(journal_do_discards)                          \
-       x(dev_do_discards)                              \
-       x(discard_one_bucket_fast)                      \
-       x(do_invalidates)                               \
-       x(nocow_flush)                                  \
-       x(io_write)                                     \
-       x(ec_block)                                     \
-       x(ec_bucket_zero)
-
-enum bch_dev_write_ref {
-#define x(n) BCH_DEV_WRITE_REF_##n,
-       BCH_DEV_WRITE_REFS()
-#undef x
-       BCH_DEV_WRITE_REF_NR,
-};
-
-struct bucket_bitmap {
-       unsigned long           *buckets;
-       u64                     nr;
-       struct mutex            lock;
-};
-
-struct bch_dev {
-       struct kobject          kobj;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       atomic_long_t           ref;
-       bool                    dying;
-       unsigned long           last_put;
-#else
-       struct percpu_ref       ref;
-#endif
-       struct completion       ref_completion;
-       struct enumerated_ref   io_ref[2];
-
-       struct bch_fs           *fs;
-
-       u8                      dev_idx;
-       /*
-        * Cached version of this device's member info from superblock
-        * Committed by bch2_write_super() -> bch_fs_mi_update()
-        */
-       struct bch_member_cpu   mi;
-       atomic64_t              errors[BCH_MEMBER_ERROR_NR];
-       unsigned long           write_errors_start;
-
-       __uuid_t                uuid;
-       char                    name[BDEVNAME_SIZE];
-
-       struct bch_sb_handle    disk_sb;
-       struct bch_sb           *sb_read_scratch;
-       int                     sb_write_error;
-       dev_t                   dev;
-       atomic_t                flush_seq;
-
-       struct bch_devs_mask    self;
-
-       /*
-        * Buckets:
-        * Per-bucket arrays are protected by either rcu_read_lock or
-        * state_lock, for device resize.
-        */
-       GENRADIX(struct bucket) buckets_gc;
-       struct bucket_gens __rcu *bucket_gens;
-       u8                      *oldest_gen;
-       unsigned long           *buckets_nouse;
-
-       struct bucket_bitmap    bucket_backpointer_mismatch;
-       struct bucket_bitmap    bucket_backpointer_empty;
-
-       struct bch_dev_usage_full __percpu
-                               *usage;
-
-       /* Allocator: */
-       u64                     alloc_cursor[3];
-
-       unsigned                nr_open_buckets;
-       unsigned                nr_partial_buckets;
-       unsigned                nr_btree_reserve;
-
-       struct work_struct      invalidate_work;
-       struct work_struct      discard_work;
-       struct mutex            discard_buckets_in_flight_lock;
-       DARRAY(struct discard_in_flight)        discard_buckets_in_flight;
-       struct work_struct      discard_fast_work;
-
-       atomic64_t              rebalance_work;
-
-       struct journal_device   journal;
-       u64                     prev_journal_sector;
-
-       struct work_struct      io_error_work;
-
-       /* The rest of this all shows up in sysfs */
-       atomic64_t              cur_latency[2];
-       struct bch2_time_stats_quantiles io_latency[2];
-
-#define CONGESTED_MAX          1024
-       atomic_t                congested;
-       u64                     congested_last;
-
-       struct io_count __percpu *io_done;
-};
-
-/*
- * initial_gc_unfixed
- * error
- * topology error
- */
-
-#define BCH_FS_FLAGS()                 \
-       x(new_fs)                       \
-       x(started)                      \
-       x(clean_recovery)               \
-       x(btree_running)                \
-       x(accounting_replay_done)       \
-       x(may_go_rw)                    \
-       x(rw)                           \
-       x(rw_init_done)                 \
-       x(was_rw)                       \
-       x(stopping)                     \
-       x(emergency_ro)                 \
-       x(going_ro)                     \
-       x(write_disable_complete)       \
-       x(clean_shutdown)               \
-       x(in_recovery)                  \
-       x(in_fsck)                      \
-       x(initial_gc_unfixed)           \
-       x(need_delete_dead_snapshots)   \
-       x(error)                        \
-       x(topology_error)               \
-       x(errors_fixed)                 \
-       x(errors_not_fixed)             \
-       x(no_invalid_checks)            \
-       x(discard_mount_opt_set)        \
-
-enum bch_fs_flags {
-#define x(n)           BCH_FS_##n,
-       BCH_FS_FLAGS()
-#undef x
-};
-
-struct btree_debug {
-       unsigned                id;
-};
-
-#define BCH_TRANSACTIONS_NR 128
-
-struct btree_transaction_stats {
-       struct bch2_time_stats  duration;
-       struct bch2_time_stats  lock_hold_times;
-       struct mutex            lock;
-       unsigned                nr_max_paths;
-       unsigned                max_mem;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-       darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
-       char                    *max_paths_text;
-};
-
-struct bch_fs_pcpu {
-       u64                     sectors_available;
-};
-
-struct journal_seq_blacklist_table {
-       size_t                  nr;
-       struct journal_seq_blacklist_table_entry {
-               u64             start;
-               u64             end;
-               bool            dirty;
-       }                       entries[];
-};
-
-struct btree_trans_buf {
-       struct btree_trans      *trans;
-};
-
-#define BCH_WRITE_REFS()                                               \
-       x(journal)                                                      \
-       x(trans)                                                        \
-       x(write)                                                        \
-       x(promote)                                                      \
-       x(node_rewrite)                                                 \
-       x(stripe_create)                                                \
-       x(stripe_delete)                                                \
-       x(reflink)                                                      \
-       x(fallocate)                                                    \
-       x(fsync)                                                        \
-       x(dio_write)                                                    \
-       x(discard)                                                      \
-       x(discard_fast)                                                 \
-       x(check_discard_freespace_key)                                  \
-       x(invalidate)                                                   \
-       x(delete_dead_snapshots)                                        \
-       x(gc_gens)                                                      \
-       x(snapshot_delete_pagecache)                                    \
-       x(sysfs)                                                        \
-       x(btree_write_buffer)                                           \
-       x(btree_node_scrub)                                             \
-       x(async_recovery_passes)                                        \
-       x(ioctl_data)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
-       BCH_WRITE_REFS()
-#undef x
-       BCH_WRITE_REF_NR,
-};
-
-#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
-
-struct bch_fs {
-       struct closure          cl;
-
-       struct list_head        list;
-       struct kobject          kobj;
-       struct kobject          counters_kobj;
-       struct kobject          internal;
-       struct kobject          opts_dir;
-       struct kobject          time_stats;
-       unsigned long           flags;
-
-       int                     minor;
-       struct device           *chardev;
-       struct super_block      *vfs_sb;
-       dev_t                   dev;
-       char                    name[40];
-       struct stdio_redirect   *stdio;
-       struct task_struct      *stdio_filter;
-
-       /* ro/rw, add/remove/resize devices: */
-       struct rw_semaphore     state_lock;
-
-       /* Counts outstanding writes, for clean transition to read-only */
-       struct enumerated_ref   writes;
-       /*
-        * Certain operations are only allowed in single threaded mode, during
-        * recovery, and we want to assert that this is the case:
-        */
-       struct task_struct      *recovery_task;
-
-       /*
-        * Analogous to c->writes, for asynchronous ops that don't necessarily
-        * need fs to be read-write
-        */
-       refcount_t              ro_ref;
-       wait_queue_head_t       ro_ref_wait;
-
-       struct work_struct      read_only_work;
-
-       struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
-
-       struct bch_accounting_mem accounting;
-
-       struct bch_replicas_cpu replicas;
-       struct bch_replicas_cpu replicas_gc;
-       struct mutex            replicas_gc_lock;
-
-       struct journal_entry_res btree_root_journal_res;
-       struct journal_entry_res clock_journal_res;
-
-       struct bch_disk_groups_cpu __rcu *disk_groups;
-
-       struct bch_opts         opts;
-
-       /* Updated by bch2_sb_update():*/
-       struct {
-               __uuid_t        uuid;
-               __uuid_t        user_uuid;
-
-               u16             version;
-               u16             version_incompat;
-               u16             version_incompat_allowed;
-               u16             version_min;
-               u16             version_upgrade_complete;
-
-               u8              nr_devices;
-               u8              clean;
-               bool            multi_device; /* true if we've ever had more than one device */
-
-               u8              encryption_type;
-
-               u64             time_base_lo;
-               u32             time_base_hi;
-               unsigned        time_units_per_sec;
-               unsigned        nsec_per_time_unit;
-               u64             features;
-               u64             compat;
-               u64             recovery_passes_required;
-               unsigned long   errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
-               u64             btrees_lost_data;
-       }                       sb;
-       DARRAY(enum bcachefs_metadata_version)
-                               incompat_versions_requested;
-
-       struct unicode_map      *cf_encoding;
-
-       struct bch_sb_handle    disk_sb;
-
-       unsigned short          block_bits;     /* ilog2(block_size) */
-
-       u16                     btree_foreground_merge_threshold;
-
-       struct closure          sb_write;
-       struct mutex            sb_lock;
-
-       /* snapshot.c: */
-       struct snapshot_table __rcu *snapshots;
-       struct mutex            snapshot_table_lock;
-       struct rw_semaphore     snapshot_create_lock;
-
-       struct snapshot_delete  snapshot_delete;
-       struct work_struct      snapshot_wait_for_pagecache_and_delete_work;
-       snapshot_id_list        snapshots_unlinked;
-       struct mutex            snapshots_unlinked_lock;
-
-       /* BTREE CACHE */
-       struct bio_set          btree_bio;
-       struct workqueue_struct *btree_read_complete_wq;
-       struct workqueue_struct *btree_write_submit_wq;
-
-       struct btree_root       btree_roots_known[BTREE_ID_NR];
-       DARRAY(struct btree_root) btree_roots_extra;
-       struct mutex            btree_root_lock;
-
-       struct btree_cache      btree_cache;
-
-       /*
-        * Cache of allocated btree nodes - if we allocate a btree node and
-        * don't use it, freeing it means that space can't be reused until
-        * going _all_ the way through the allocator (which exposes us to a
-        * livelock when allocation of btree reserves fails halfway through)
-        * - instead, we can stick them here:
-        */
-       struct btree_alloc      btree_reserve_cache[BTREE_NODE_RESERVE * 2];
-       unsigned                btree_reserve_cache_nr;
-       struct mutex            btree_reserve_cache_lock;
-
-       mempool_t               btree_interior_update_pool;
-       struct list_head        btree_interior_update_list;
-       struct list_head        btree_interior_updates_unwritten;
-       struct mutex            btree_interior_update_lock;
-       struct closure_waitlist btree_interior_update_wait;
-
-       struct workqueue_struct *btree_interior_update_worker;
-       struct work_struct      btree_interior_update_work;
-
-       struct workqueue_struct *btree_node_rewrite_worker;
-       struct list_head        btree_node_rewrites;
-       struct list_head        btree_node_rewrites_pending;
-       spinlock_t              btree_node_rewrites_lock;
-       struct closure_waitlist btree_node_rewrites_wait;
-
-       /* btree_io.c: */
-       spinlock_t              btree_write_error_lock;
-       struct btree_write_stats {
-               atomic64_t      nr;
-               atomic64_t      bytes;
-       }                       btree_write_stats[BTREE_WRITE_TYPE_NR];
-
-       /* btree_iter.c: */
-       struct seqmutex         btree_trans_lock;
-       struct list_head        btree_trans_list;
-       mempool_t               btree_trans_pool;
-       mempool_t               btree_trans_mem_pool;
-       struct btree_trans_buf  __percpu        *btree_trans_bufs;
-
-       struct srcu_struct      btree_trans_barrier;
-       bool                    btree_trans_barrier_initialized;
-
-       struct btree_key_cache  btree_key_cache;
-       unsigned                btree_key_cache_btrees;
-
-       struct btree_write_buffer btree_write_buffer;
-
-       struct workqueue_struct *btree_update_wq;
-       struct workqueue_struct *btree_write_complete_wq;
-       /* copygc needs its own workqueue for index updates */
-       struct workqueue_struct *copygc_wq;
-       /*
-        * Use a dedicated wq for write ref holder tasks. Required to avoid
-        * dependency problems with other wq tasks that can block on ref
-        * draining, such as read-only transition.
-        */
-       struct workqueue_struct *write_ref_wq;
-
-       /* ALLOCATION */
-       struct bch_devs_mask    online_devs;
-       struct bch_devs_mask    rw_devs[BCH_DATA_NR];
-       unsigned long           rw_devs_change_count;
-
-       u64                     capacity; /* sectors */
-       u64                     reserved; /* sectors */
-
-       /*
-        * When capacity _decreases_ (due to a disk being removed), we
-        * increment capacity_gen - this invalidates outstanding reservations
-        * and forces them to be revalidated
-        */
-       u32                     capacity_gen;
-       unsigned                bucket_size_max;
-
-       atomic64_t              sectors_available;
-       struct mutex            sectors_available_lock;
-
-       struct bch_fs_pcpu __percpu     *pcpu;
-
-       struct percpu_rw_semaphore      mark_lock;
-
-       seqcount_t                      usage_lock;
-       struct bch_fs_usage_base __percpu *usage;
-       u64 __percpu            *online_reserved;
-
-       unsigned long           allocator_last_stuck;
-
-       struct io_clock         io_clock[2];
-
-       /* JOURNAL SEQ BLACKLIST */
-       struct journal_seq_blacklist_table *
-                               journal_seq_blacklist_table;
-
-       /* ALLOCATOR */
-       spinlock_t              freelist_lock;
-       struct closure_waitlist freelist_wait;
-
-       open_bucket_idx_t       open_buckets_freelist;
-       open_bucket_idx_t       open_buckets_nr_free;
-       struct closure_waitlist open_buckets_wait;
-       struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
-       open_bucket_idx_t       open_buckets_hash[OPEN_BUCKETS_COUNT];
-
-       open_bucket_idx_t       open_buckets_partial[OPEN_BUCKETS_COUNT];
-       open_bucket_idx_t       open_buckets_partial_nr;
-
-       struct write_point      btree_write_point;
-       struct write_point      rebalance_write_point;
-
-       struct write_point      write_points[WRITE_POINT_MAX];
-       struct hlist_head       write_points_hash[WRITE_POINT_HASH_NR];
-       struct mutex            write_points_hash_lock;
-       unsigned                write_points_nr;
-
-       struct buckets_waiting_for_journal buckets_waiting_for_journal;
-
-       /* GARBAGE COLLECTION */
-       struct work_struct      gc_gens_work;
-       unsigned long           gc_count;
-
-       enum btree_id           gc_gens_btree;
-       struct bpos             gc_gens_pos;
-
-       /*
-        * Tracks GC's progress - everything in the range [ZERO_KEY..gc_pos]
-        * has been marked by GC.
-        *
-        * gc_pos.phase is a superset of btree_ids (BTREE_ID_extents etc.)
-        *
-        * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
-        * can read without a lock.
-        */
-       seqcount_t              gc_pos_lock;
-       struct gc_pos           gc_pos;
-
-       /*
-        * The allocation code needs gc_mark in struct bucket to be correct, but
-        * it's not while a gc is in progress.
-        */
-       struct rw_semaphore     gc_lock;
-       struct mutex            gc_gens_lock;
-
-       /* IO PATH */
-       struct semaphore        io_in_flight;
-       struct bio_set          bio_read;
-       struct bio_set          bio_read_split;
-       struct bio_set          bio_write;
-       struct bio_set          replica_set;
-       struct mutex            bio_bounce_pages_lock;
-       mempool_t               bio_bounce_pages;
-       struct bucket_nocow_lock_table
-                               nocow_locks;
-       struct rhashtable       promote_table;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       struct async_obj_list   async_objs[BCH_ASYNC_OBJ_NR];
-#endif
-
-       mempool_t               compression_bounce[2];
-       mempool_t               compress_workspace[BCH_COMPRESSION_OPT_NR];
-       size_t                  zstd_workspace_size;
-
-       struct bch_key          chacha20_key;
-       bool                    chacha20_key_set;
-
-       atomic64_t              key_version;
-
-       mempool_t               large_bkey_pool;
-
-       /* MOVE.C */
-       struct list_head        moving_context_list;
-       struct mutex            moving_context_lock;
-
-       /* REBALANCE */
-       struct bch_fs_rebalance rebalance;
-
-       /* COPYGC */
-       struct task_struct      *copygc_thread;
-       struct write_point      copygc_write_point;
-       s64                     copygc_wait_at;
-       s64                     copygc_wait;
-       bool                    copygc_running;
-       wait_queue_head_t       copygc_running_wq;
-
-       /* STRIPES: */
-       GENRADIX(struct gc_stripe) gc_stripes;
-
-       struct hlist_head       ec_stripes_new[32];
-       spinlock_t              ec_stripes_new_lock;
-
-       /* ERASURE CODING */
-       struct list_head        ec_stripe_head_list;
-       struct mutex            ec_stripe_head_lock;
-
-       struct list_head        ec_stripe_new_list;
-       struct mutex            ec_stripe_new_lock;
-       wait_queue_head_t       ec_stripe_new_wait;
-
-       struct work_struct      ec_stripe_create_work;
-       u64                     ec_stripe_hint;
-
-       struct work_struct      ec_stripe_delete_work;
-
-       struct bio_set          ec_bioset;
-
-       /* REFLINK */
-       reflink_gc_table        reflink_gc_table;
-       size_t                  reflink_gc_nr;
-
-       /* fs.c */
-       struct list_head        vfs_inodes_list;
-       struct mutex            vfs_inodes_lock;
-       struct rhashtable       vfs_inodes_table;
-       struct rhltable         vfs_inodes_by_inum_table;
-
-       /* VFS IO PATH - fs-io.c */
-       struct bio_set          writepage_bioset;
-       struct bio_set          dio_write_bioset;
-       struct bio_set          dio_read_bioset;
-       struct bio_set          nocow_flush_bioset;
-
-       /* QUOTAS */
-       struct bch_memquota_type quotas[QTYP_NR];
-
-       /* RECOVERY */
-       u64                     journal_replay_seq_start;
-       u64                     journal_replay_seq_end;
-       struct bch_fs_recovery  recovery;
-
-       /* DEBUG JUNK */
-       struct dentry           *fs_debug_dir;
-       struct dentry           *btree_debug_dir;
-       struct dentry           *async_obj_dir;
-       struct btree_debug      btree_debug[BTREE_ID_NR];
-       struct btree            *verify_data;
-       struct btree_node       *verify_ondisk;
-       struct mutex            verify_lock;
-
-       /*
-        * A btree node on disk could have too many bsets for an iterator to fit
-        * on the stack - have to dynamically allocate them
-        */
-       mempool_t               fill_iter;
-
-       mempool_t               btree_bounce_pool;
-
-       struct journal          journal;
-       GENRADIX(struct journal_replay *) journal_entries;
-       u64                     journal_entries_base_seq;
-       struct journal_keys     journal_keys;
-       struct list_head        journal_iters;
-
-       struct find_btree_nodes found_btree_nodes;
-
-       u64                     last_bucket_seq_cleanup;
-
-       u64                     counters_on_mount[BCH_COUNTER_NR];
-       u64 __percpu            *counters;
-
-       struct bch2_time_stats  times[BCH_TIME_STAT_NR];
-
-       struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
-
-       /* ERRORS */
-       struct list_head        fsck_error_msgs;
-       struct mutex            fsck_error_msgs_lock;
-       bool                    fsck_alloc_msgs_err;
-
-       bch_sb_errors_cpu       fsck_error_counts;
-       struct mutex            fsck_error_counts_lock;
-};
-
-extern struct wait_queue_head bch2_read_only_wait;
-
-static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
-{
-       if (test_bit(BCH_FS_stopping, &c->flags))
-               return false;
-
-       return refcount_inc_not_zero(&c->ro_ref);
-}
-
-static inline void bch2_ro_ref_put(struct bch_fs *c)
-{
-       if (refcount_dec_and_test(&c->ro_ref))
-               wake_up(&c->ro_ref_wait);
-}
-
-static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
-{
-#ifndef NO_BCACHEFS_FS
-       if (c->vfs_sb)
-               c->vfs_sb->s_bdi->ra_pages = ra_pages;
-#endif
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
-       return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
-       return c->opts.block_size;
-}
-
-static inline unsigned block_sectors(const struct bch_fs *c)
-{
-       return c->opts.block_size >> 9;
-}
-
-static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
-{
-       return c->btree_key_cache_btrees & (1U << btree);
-}
-
-static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
-{
-       struct timespec64 t;
-       s64 sec;
-       s32 rem;
-
-       time += c->sb.time_base_lo;
-
-       sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-
-       set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
-
-       return t;
-}
-
-static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
-{
-       return (ts.tv_sec * c->sb.time_units_per_sec +
-               (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
-}
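
A worked example, assuming nanosecond time units and a zero base (time_base_lo == 0, time_units_per_sec == NSEC_PER_SEC, nsec_per_time_unit == 1):

        /* ts = { .tv_sec = 2, .tv_nsec = 500000000 }                   */
        /* timespec_to_bch2_time(c, ts) == 2 * 1000000000 + 500000000   */
        /*                              == 2500000000                   */
        /* bch2_time_to_timespec(c, 2500000000) divides back out:       */
        /*   sec = 2, rem = 500000000  ->  { 2, 500000000 }             */
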
-
-static inline s64 bch2_current_time(const struct bch_fs *c)
-{
-       struct timespec64 now;
-
-       ktime_get_coarse_real_ts64(&now);
-       return timespec_to_bch2_time(c, now);
-}
-
-static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
-{
-       return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
-}
-
-static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
-{
-       struct stdio_redirect *stdio = c->stdio;
-
-       if (c->stdio_filter && c->stdio_filter != current)
-               stdio = NULL;
-       return stdio;
-}
-
-static inline unsigned metadata_replicas_required(struct bch_fs *c)
-{
-       return min(c->opts.metadata_replicas,
-                  c->opts.metadata_replicas_required);
-}
-
-static inline unsigned data_replicas_required(struct bch_fs *c)
-{
-       return min(c->opts.data_replicas,
-                  c->opts.data_replicas_required);
-}
-
-#define BKEY_PADDED_ONSTACK(key, pad)                          \
-       struct { struct bkey_i key; __u64 key ## _pad[pad]; }
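
A usage sketch: the macro declares an anonymous struct whose trailing pad lets a struct bkey_i carry inline value bytes on the stack (the variable name and pad size here are illustrative):

        BKEY_PADDED_ONSTACK(k, 8) tmp;  /* bkey_i + 8 u64s of value space */

        bkey_init(&tmp.k.k);
        /* tmp.k can now be passed to code expecting a struct bkey_i with
         * up to 8 u64s of inline value following the header. */
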
-
-/*
- * This is needed because discard is both a filesystem option and a device
- * option, and mount options are supposed to apply to that mount and not be
- * persisted, i.e. if it's set as a mount option we can't propagate it to the
- * device.
- */
-static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
-{
-       return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
-               ? c->opts.discard
-               : ca->mi.discard;
-}
-
-static inline bool bch2_fs_casefold_enabled(struct bch_fs *c)
-{
-#ifdef CONFIG_UNICODE
-       return !c->opts.casefold_disabled;
-#else
-       return false;
-#endif
-}
-
-#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
deleted file mode 100644 (file)
index b4a04df..0000000
+++ /dev/null
@@ -1,1545 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FORMAT_H
-#define _BCACHEFS_FORMAT_H
-
-/*
- * bcachefs on disk data structures
- *
- * OVERVIEW:
- *
- * There are three main types of on disk data structures in bcachefs (this is
- * reduced from 5 in bcache)
- *
- *  - superblock
- *  - journal
- *  - btree
- *
- * The btree is the primary structure; most metadata exists as keys in the
- * various btrees. There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem or prior to reading the journal/btree
- * roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
- */
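
To make the "just under 2k" figure above concrete, this is the arithmetic (an editorial check, not a line from the file):

        /* u64s is a __u8 counting 64-bit words of key + value combined: */
        _Static_assert(U8_MAX * sizeof(__u64) == 2040,
                       "max key+value size is 2040 bytes, just under 2KiB");
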
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end)                                \
-static const __maybe_unused unsigned   name##_OFFSET = offset;         \
-static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
-                                                                       \
-static inline __u64 name(const type *k)                                        \
-{                                                                      \
-       return (k->field >> offset) & ~(~0ULL << (end - offset));       \
-}                                                                      \
-                                                                       \
-static inline void SET_##name(type *k, __u64 v)                                \
-{                                                                      \
-       k->field &= ~(~(~0ULL << (end - offset)) << offset);            \
-       k->field |= (v & ~(~0ULL << (end - offset))) << offset;         \
-}
-
-#define LE_BITMASK(_bits, name, type, field, offset, end)              \
-static const __maybe_unused unsigned   name##_OFFSET = offset;         \
-static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
-static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
-                                                                       \
-static inline __u64 name(const type *k)                                        \
-{                                                                      \
-       return (__le##_bits##_to_cpu(k->field) >> offset) &             \
-               ~(~0ULL << (end - offset));                             \
-}                                                                      \
-                                                                       \
-static inline void SET_##name(type *k, __u64 v)                                \
-{                                                                      \
-       __u##_bits new = __le##_bits##_to_cpu(k->field);                \
-                                                                       \
-       new &= ~(~(~0ULL << (end - offset)) << offset);                 \
-       new |= (v & ~(~0ULL << (end - offset))) << offset;              \
-       k->field = __cpu_to_le##_bits(new);                             \
-}
-
-#define LE16_BITMASK(n, t, f, o, e)    LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e)    LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e)    LE_BITMASK(64, n, t, f, o, e)
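
Expanding one hypothetical invocation shows what these macros generate: a getter and a setter over a subrange of a little-endian field (struct bch_example and its flags member are made up for illustration):

LE64_BITMASK(BCH_EXAMPLE_FIELD, struct bch_example, flags, 0, 4);

/* ...expands to roughly: */
static inline __u64 BCH_EXAMPLE_FIELD(const struct bch_example *k)
{
        return (__le64_to_cpu(k->flags) >> 0) & 15;     /* low 4 bits */
}

static inline void SET_BCH_EXAMPLE_FIELD(struct bch_example *k, __u64 v)
{
        __u64 new = __le64_to_cpu(k->flags);

        new &= ~15ULL;                  /* clear bits 0..3 */
        new |= v & 15;                  /* set them from v */
        k->flags = __cpu_to_le64(new);
}
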
-
-struct bkey_format {
-       __u8            key_u64s;
-       __u8            nr_fields;
-       /* One unused slot for now: */
-       __u8            bits_per_field[6];
-       __le64          field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
-       /*
-        * Word order matches machine byte order - btree code treats a bpos as a
-        * single large integer, for search/comparison purposes
-        *
-        * Note that wherever a bpos is embedded in another on disk data
-        * structure, it has to be byte swabbed when reading in metadata that
-        * wasn't written in native endian order:
-        */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       __u32           snapshot;
-       __u64           offset;
-       __u64           inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-       __u64           inode;
-       __u64           offset;         /* Points to end of extent - sectors */
-       __u32           snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX                  ((__u64)~0ULL)
-#define KEY_OFFSET_MAX                 ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX               ((__u32)~0U)
-#define KEY_SIZE_MAX                   ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
-       return (struct bpos) {
-               .inode          = inode,
-               .offset         = offset,
-               .snapshot       = snapshot,
-       };
-}
-
-#define POS_MIN                                SPOS(0, 0, 0)
-#define POS_MAX                                SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX                       SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset)           SPOS(_inode, _offset, 0)
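
Because word order matches machine byte order, comparing two positions as one big integer is equivalent to lexicographic comparison on (inode, offset, snapshot), most significant first. An editorial sketch (the tree's real comparison helpers live elsewhere):

static inline int bpos_cmp_sketch(struct bpos l, struct bpos r)
{
        if (l.inode != r.inode)
                return l.inode < r.inode ? -1 : 1;
        if (l.offset != r.offset)
                return l.offset < r.offset ? -1 : 1;
        if (l.snapshot != r.snapshot)
                return l.snapshot < r.snapshot ? -1 : 1;
        return 0;
}
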
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
-       __u64           __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       __u64           lo;
-       __u32           hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-       __u32           hi;
-       __u64           lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
-       /* Size of combined key and value, in u64s */
-       __u8            u64s;
-
-       /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u8            format:7,
-                       needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u8            needs_whiteout:1,
-                       format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
-       /* Type of the value */
-       __u8            type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       __u8            pad[1];
-
-       struct bversion bversion;
-       __u32           size;           /* extent size, in sectors */
-       struct bpos     p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-       struct bpos     p;
-       __u32           size;           /* extent size, in sectors */
-       struct bversion bversion;
-
-       __u8            pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means
- * that decently often extent keys will fit into only 8 bytes, instead of
- * spilling over to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
- */
-__aligned(8)
-#endif
-;
-
-struct bkey_packed {
-       __u64           _data[0];
-
-       /* Size of combined key and value, in u64s */
-       __u8            u64s;
-
-       /* Format of key (0 for format local to btree node) */
-
-       /*
-        * XXX: next incompat on disk format change, switch format and
-        * needs_whiteout - bkey_packed() will be cheaper if format is the high
-        * bits of the bitfield
-        */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u8            format:7,
-                       needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u8            needs_whiteout:1,
-                       format:7;
-#endif
-
-       /* Type of the value */
-       __u8            type;
-       __u8            key_start[0];
-
-       /*
-        * We copy bkeys with struct assignment in various places, and while
-        * that shouldn't be done with packed bkeys we can't disallow it in C,
-        * and it's legal to cast a bkey to a bkey_packed - so padding it out
-        * to the same size as struct bkey should hopefully be safest.
-        */
-       __u8            pad[sizeof(struct bkey) - 3];
-} __packed __aligned(8);
-
-typedef struct {
-       __le64                  lo;
-       __le64                  hi;
-} bch_le128;
-
-#define BKEY_U64s                      (sizeof(struct bkey) / sizeof(__u64))
-#define BKEY_U64s_MAX                  U8_MAX
-#define BKEY_VAL_U64s_MAX              (BKEY_U64s_MAX - BKEY_U64s)
-
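
Spelling the constants out (an editorial check, assuming the 40-byte struct bkey layout the field list above implies):

        _Static_assert(sizeof(struct bkey) == 5 * sizeof(__u64),
                       "bkey header is 5 u64s");
        /* BKEY_U64s         == 5                                  */
        /* BKEY_VAL_U64s_MAX == 255 - 5 == 250 u64s of inline
         * value following the header                              */
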
-#define KEY_PACKED_BITS_START          24
-
-#define KEY_FORMAT_LOCAL_BTREE         0
-#define KEY_FORMAT_CURRENT             1
-
-enum bch_bkey_fields {
-       BKEY_FIELD_INODE,
-       BKEY_FIELD_OFFSET,
-       BKEY_FIELD_SNAPSHOT,
-       BKEY_FIELD_SIZE,
-       BKEY_FIELD_VERSION_HI,
-       BKEY_FIELD_VERSION_LO,
-       BKEY_NR_FIELDS,
-};
-
-#define bkey_format_field(name, field)                                 \
-       [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
-
-#define BKEY_FORMAT_CURRENT                                            \
-((struct bkey_format) {                                                        \
-       .key_u64s       = BKEY_U64s,                                    \
-       .nr_fields      = BKEY_NR_FIELDS,                               \
-       .bits_per_field = {                                             \
-               bkey_format_field(INODE,        p.inode),               \
-               bkey_format_field(OFFSET,       p.offset),              \
-               bkey_format_field(SNAPSHOT,     p.snapshot),            \
-               bkey_format_field(SIZE,         size),                  \
-               bkey_format_field(VERSION_HI,   bversion.hi),           \
-               bkey_format_field(VERSION_LO,   bversion.lo),           \
-       },                                                              \
-})
-
-/* bkey with inline value */
-struct bkey_i {
-       __u64                   _data[0];
-
-       struct bkey     k;
-       struct bch_val  v;
-};
-
-#define POS_KEY(_pos)                                                  \
-((struct bkey) {                                                       \
-       .u64s           = BKEY_U64s,                                    \
-       .format         = KEY_FORMAT_CURRENT,                           \
-       .p              = _pos,                                         \
-})
-
-#define KEY(_inode, _offset, _size)                                    \
-((struct bkey) {                                                       \
-       .u64s           = BKEY_U64s,                                    \
-       .format         = KEY_FORMAT_CURRENT,                           \
-       .p              = POS(_inode, _offset),                         \
-       .size           = _size,                                        \
-})
-
-static inline void bkey_init(struct bkey *k)
-{
-       *k = KEY(0, 0, 0);
-}
-
-#define bkey_bytes(_k)         ((_k)->u64s * sizeof(__u64))
-
-#define __BKEY_PADDED(key, pad)                                        \
-       struct bkey_i key; __u64 key ## _pad[pad]
-
-enum bch_bkey_type_flags {
-       BKEY_TYPE_strict_btree_checks   = BIT(0),
-};
-
-/*
- * - DELETED keys are used internally to mark keys that should be ignored but
- *   override keys in composition order.  Their version number is ignored.
- *
- * - DISCARDED keys indicate that the data is all 0s because it has been
- *   discarded. DISCARDs may have a version; if the version is nonzero the key
- *   will be persistent, otherwise the key will be dropped whenever the btree
- *   node is rewritten (like DELETED keys).
- *
- * - ERROR: any read of the data returns a read error, as the data was lost due
- *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
- *   by new writes or cluster-wide GC. Node repair can also overwrite them with
- *   the same or a more recent version number, but not with an older version
- *   number.
- *
- * - WHITEOUT: for hash table btrees
- */
-#define BCH_BKEY_TYPES()                                               \
-       x(deleted,              0,      0)                              \
-       x(whiteout,             1,      0)                              \
-       x(error,                2,      0)                              \
-       x(cookie,               3,      0)                              \
-       x(hash_whiteout,        4,      BKEY_TYPE_strict_btree_checks)  \
-       x(btree_ptr,            5,      BKEY_TYPE_strict_btree_checks)  \
-       x(extent,               6,      BKEY_TYPE_strict_btree_checks)  \
-       x(reservation,          7,      BKEY_TYPE_strict_btree_checks)  \
-       x(inode,                8,      BKEY_TYPE_strict_btree_checks)  \
-       x(inode_generation,     9,      BKEY_TYPE_strict_btree_checks)  \
-       x(dirent,               10,     BKEY_TYPE_strict_btree_checks)  \
-       x(xattr,                11,     BKEY_TYPE_strict_btree_checks)  \
-       x(alloc,                12,     BKEY_TYPE_strict_btree_checks)  \
-       x(quota,                13,     BKEY_TYPE_strict_btree_checks)  \
-       x(stripe,               14,     BKEY_TYPE_strict_btree_checks)  \
-       x(reflink_p,            15,     BKEY_TYPE_strict_btree_checks)  \
-       x(reflink_v,            16,     BKEY_TYPE_strict_btree_checks)  \
-       x(inline_data,          17,     BKEY_TYPE_strict_btree_checks)  \
-       x(btree_ptr_v2,         18,     BKEY_TYPE_strict_btree_checks)  \
-       x(indirect_inline_data, 19,     BKEY_TYPE_strict_btree_checks)  \
-       x(alloc_v2,             20,     BKEY_TYPE_strict_btree_checks)  \
-       x(subvolume,            21,     BKEY_TYPE_strict_btree_checks)  \
-       x(snapshot,             22,     BKEY_TYPE_strict_btree_checks)  \
-       x(inode_v2,             23,     BKEY_TYPE_strict_btree_checks)  \
-       x(alloc_v3,             24,     BKEY_TYPE_strict_btree_checks)  \
-       x(set,                  25,     0)                              \
-       x(lru,                  26,     BKEY_TYPE_strict_btree_checks)  \
-       x(alloc_v4,             27,     BKEY_TYPE_strict_btree_checks)  \
-       x(backpointer,          28,     BKEY_TYPE_strict_btree_checks)  \
-       x(inode_v3,             29,     BKEY_TYPE_strict_btree_checks)  \
-       x(bucket_gens,          30,     BKEY_TYPE_strict_btree_checks)  \
-       x(snapshot_tree,        31,     BKEY_TYPE_strict_btree_checks)  \
-       x(logged_op_truncate,   32,     BKEY_TYPE_strict_btree_checks)  \
-       x(logged_op_finsert,    33,     BKEY_TYPE_strict_btree_checks)  \
-       x(accounting,           34,     BKEY_TYPE_strict_btree_checks)  \
-       x(inode_alloc_cursor,   35,     BKEY_TYPE_strict_btree_checks)
-
-enum bch_bkey_type {
-#define x(name, nr, ...) KEY_TYPE_##name       = nr,
-       BCH_BKEY_TYPES()
-#undef x
-       KEY_TYPE_MAX,
-};
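/*
 * Sketch: the same x-macro expands again to build a table of type names,
 * indexed by KEY_TYPE_* value (the tree keeps a similar table; this one is
 * illustrative).
 */
static const __maybe_unused char * const example_bkey_type_names[] = {
#define x(name, nr, ...) [nr] = #name,
	BCH_BKEY_TYPES()
#undef x
};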
-
-struct bch_deleted {
-       struct bch_val          v;
-};
-
-struct bch_whiteout {
-       struct bch_val          v;
-};
-
-struct bch_error {
-       struct bch_val          v;
-};
-
-struct bch_cookie {
-       struct bch_val          v;
-       __le64                  cookie;
-};
-
-struct bch_hash_whiteout {
-       struct bch_val          v;
-};
-
-struct bch_set {
-       struct bch_val          v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
-       __le64                  lo;
-       __le64                  hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
-       struct bch_val          v;
-       __u8                    btree_id;
-       __u8                    level;
-       __u8                    data_type;
-       __u8                    bucket_gen;
-       __u32                   pad;
-       __u32                   bucket_len;
-       struct bpos             pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
-       __u64                   _data[0];
-       __le32                  u64s;
-       __le32                  type;
-};
-
-#define BCH_SB_FIELDS()                                \
-       x(journal,                      0)      \
-       x(members_v1,                   1)      \
-       x(crypt,                        2)      \
-       x(replicas_v0,                  3)      \
-       x(quota,                        4)      \
-       x(disk_groups,                  5)      \
-       x(clean,                        6)      \
-       x(replicas,                     7)      \
-       x(journal_seq_blacklist,        8)      \
-       x(journal_v2,                   9)      \
-       x(counters,                     10)     \
-       x(members_v2,                   11)     \
-       x(errors,                       12)     \
-       x(ext,                          13)     \
-       x(downgrade,                    14)     \
-       x(recovery_passes,              15)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "recovery_passes_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr)       BCH_SB_FIELD_##f = nr,
-       BCH_SB_FIELDS()
-#undef x
-       BCH_SB_FIELD_NR
-};
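/*
 * Sketch (hypothetical helper, standing in for the in-tree vstruct macros):
 * superblock fields are variable-size, so walking them means advancing by
 * u64s. Assumes le32_to_cpu() from the kernel byteorder helpers.
 */
static inline struct bch_sb_field *example_sb_field_next(struct bch_sb_field *f)
{
	return (void *) f + le32_to_cpu(f->u64s) * sizeof(__u64);
}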
-
-/*
- * Most superblock fields are replicated in all device's superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS            \
-       ((1U << BCH_SB_FIELD_journal)|          \
-        (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
-       struct bch_sb_field     field;
-       __le64                  buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
-       struct bch_sb_field     field;
-
-       struct bch_sb_field_journal_v2_entry {
-               __le64          start;
-               __le64          nr;
-       }                       d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
-       __le32                  d[4];
-};
-
-struct bch_key {
-       __le64                  key[4];
-};
-
-#define BCH_KEY_MAGIC                                  \
-       (((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|               \
-        ((__u64) 'h' << 16)|((__u64) '*' << 24)|               \
-        ((__u64) '*' << 32)|((__u64) 'k' << 40)|               \
-        ((__u64) 'e' << 48)|((__u64) 'y' << 56))
-
-struct bch_encrypted_key {
-       __le64                  magic;
-       struct bch_key          key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
-struct bch_sb_field_crypt {
-       struct bch_sb_field     field;
-
-       __le64                  flags;
-       __le64                  kdf_flags;
-       struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE,       struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
-       BCH_KDF_SCRYPT          = 0,
-       BCH_KDF_NR              = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags,  0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
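/*
 * Sketch (hypothetical helper): because the scrypt parameters are stored as
 * base-2 logs, the actual N/r/p values are recovered by shifting.
 */
static inline __u64 example_scrypt_n(struct bch_sb_field_crypt *crypt)
{
	return 1ULL << BCH_KDF_SCRYPT_N(crypt);
}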
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
-       __le16                  u64s;
-       __u8                    btree_id;
-       __u8                    level;
-       __u8                    type; /* designates what this jset holds */
-       __u8                    pad[3];
-
-       struct bkey_i           start[0];
-       __u64                   _data[];
-};
-
-struct bch_sb_field_clean {
-       struct bch_sb_field     field;
-
-       __le32                  flags;
-       __le16                  _read_clock; /* no longer used */
-       __le16                  _write_clock;
-       __le64                  journal_seq;
-
-       struct jset_entry       start[0];
-       __u64                   _data[];
-};
-
-struct bch_sb_field_ext {
-       struct bch_sb_field     field;
-       __le64                  recovery_passes_required[2];
-       __le64                  errors_silent[8];
-       __le64                  btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v)          ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v)          ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor)    (((_major) << 10)|(_minor) << 0)
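/* Worked example (illustrative): major 1, minor 7 packs into
 * (1 << 10) | 7 == 1031, and the accessors invert the packing: */
_Static_assert(BCH_VERSION(1, 7) == 1031, "");
_Static_assert(BCH_VERSION_MAJOR(1031) == 1, "");
_Static_assert(BCH_VERSION_MINOR(1031) == 7, "");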
-
-/*
- * field 1:            version name
- * field 2:            BCH_VERSION(major, minor)
- */
-#define BCH_METADATA_VERSIONS()                                                \
-       x(bkey_renumber,                BCH_VERSION(0, 10))             \
-       x(inode_btree_change,           BCH_VERSION(0, 11))             \
-       x(snapshot,                     BCH_VERSION(0, 12))             \
-       x(inode_backpointers,           BCH_VERSION(0, 13))             \
-       x(btree_ptr_sectors_written,    BCH_VERSION(0, 14))             \
-       x(snapshot_2,                   BCH_VERSION(0, 15))             \
-       x(reflink_p_fix,                BCH_VERSION(0, 16))             \
-       x(subvol_dirent,                BCH_VERSION(0, 17))             \
-       x(inode_v2,                     BCH_VERSION(0, 18))             \
-       x(freespace,                    BCH_VERSION(0, 19))             \
-       x(alloc_v4,                     BCH_VERSION(0, 20))             \
-       x(new_data_types,               BCH_VERSION(0, 21))             \
-       x(backpointers,                 BCH_VERSION(0, 22))             \
-       x(inode_v3,                     BCH_VERSION(0, 23))             \
-       x(unwritten_extents,            BCH_VERSION(0, 24))             \
-       x(bucket_gens,                  BCH_VERSION(0, 25))             \
-       x(lru_v2,                       BCH_VERSION(0, 26))             \
-       x(fragmentation_lru,            BCH_VERSION(0, 27))             \
-       x(no_bps_in_alloc_keys,         BCH_VERSION(0, 28))             \
-       x(snapshot_trees,               BCH_VERSION(0, 29))             \
-       x(major_minor,                  BCH_VERSION(1,  0))             \
-       x(snapshot_skiplists,           BCH_VERSION(1,  1))             \
-       x(deleted_inodes,               BCH_VERSION(1,  2))             \
-       x(rebalance_work,               BCH_VERSION(1,  3))             \
-       x(member_seq,                   BCH_VERSION(1,  4))             \
-       x(subvolume_fs_parent,          BCH_VERSION(1,  5))             \
-       x(btree_subvolume_children,     BCH_VERSION(1,  6))             \
-       x(mi_btree_bitmap,              BCH_VERSION(1,  7))             \
-       x(bucket_stripe_sectors,        BCH_VERSION(1,  8))             \
-       x(disk_accounting_v2,           BCH_VERSION(1,  9))             \
-       x(disk_accounting_v3,           BCH_VERSION(1, 10))             \
-       x(disk_accounting_inum,         BCH_VERSION(1, 11))             \
-       x(rebalance_work_acct_fix,      BCH_VERSION(1, 12))             \
-       x(inode_has_child_snapshots,    BCH_VERSION(1, 13))             \
-       x(backpointer_bucket_gen,       BCH_VERSION(1, 14))             \
-       x(disk_accounting_big_endian,   BCH_VERSION(1, 15))             \
-       x(reflink_p_may_update_opts,    BCH_VERSION(1, 16))             \
-       x(inode_depth,                  BCH_VERSION(1, 17))             \
-       x(persistent_inode_cursors,     BCH_VERSION(1, 18))             \
-       x(autofix_errors,               BCH_VERSION(1, 19))             \
-       x(directory_size,               BCH_VERSION(1, 20))             \
-       x(cached_backpointers,          BCH_VERSION(1, 21))             \
-       x(stripe_backpointers,          BCH_VERSION(1, 22))             \
-       x(stripe_lru,                   BCH_VERSION(1, 23))             \
-       x(casefolding,                  BCH_VERSION(1, 24))             \
-       x(extent_flags,                 BCH_VERSION(1, 25))             \
-       x(snapshot_deletion_v2,         BCH_VERSION(1, 26))             \
-       x(fast_device_removal,          BCH_VERSION(1, 27))             \
-       x(inode_has_case_insensitive,   BCH_VERSION(1, 28))
-
-enum bcachefs_metadata_version {
-       bcachefs_metadata_version_min = 9,
-#define x(t, n)        bcachefs_metadata_version_##t = n,
-       BCH_METADATA_VERSIONS()
-#undef x
-       bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR                  8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX    16 /* 32 MB */
-
-struct bch_sb_layout {
-       __uuid_t                magic;  /* bcachefs superblock UUID */
-       __u8                    layout_type;
-       __u8                    sb_max_size_bits; /* base 2 of 512 byte sectors */
-       __u8                    nr_superblocks;
-       __u8                    pad[5];
-       __le64                  sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR   7
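/*
 * Sketch (hypothetical helper): sb_max_size_bits is the base-2 log of the
 * superblock size in 512-byte sectors, so the BCH_SB_LAYOUT_SIZE_BITS_MAX
 * cap above works out to 512 << 16 == 32 MB.
 */
static inline __u64 example_sb_max_size_bytes(const struct bch_sb_layout *l)
{
	return 512ULL << l->sb_max_size_bits;
}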
-
-/*
- * @offset     - sector where this sb was written
- * @version    - on disk format version
- * @version_min        - oldest metadata version this filesystem contains, so we
- *               know when we can safely drop compatibility code and refuse to
- *               mount filesystems we'd need it for
- * @magic      - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq                - identifies the most recent superblock; incremented each
- *               time the superblock is written
- * @uuid       - used for generating various magic numbers and identifying
- *               member devices, never changes
- * @user_uuid  - user visible UUID, may be changed
- * @label      - filesystem label
- * @features   - enabled incompatible features
- */
-struct bch_sb {
-       struct bch_csum         csum;
-       __le16                  version;
-       __le16                  version_min;
-       __le16                  pad[2];
-       __uuid_t                magic;
-       __uuid_t                uuid;
-       __uuid_t                user_uuid;
-       __u8                    label[BCH_SB_LABEL_SIZE];
-       __le64                  offset;
-       __le64                  seq;
-
-       __le16                  block_size;
-       __u8                    dev_idx;
-       __u8                    nr_devices;
-       __le32                  u64s;
-
-       __le64                  time_base_lo;
-       __le32                  time_base_hi;
-       __le32                  time_precision;
-
-       __le64                  flags[7];
-       __le64                  write_time;
-       __le64                  features[2];
-       __le64                  compat[2];
-
-       struct bch_sb_layout    layout;
-
-       struct bch_sb_field     start[0];
-       __le64                  _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED  - set on first mount
- * BCH_SB_CLEAN                - did we shut down cleanly? Just a hint, doesn't affect
- *                       behaviour of mount/recovery path
- * BCH_SB_INODE_32BIT  - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit MACs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- *                        DATA/META_CSUM_TYPE. Also indicates the encryption
- *                        algorithm in use, if/when we get more than one
- */
-
-LE16_BITMASK(BCH_SB_BLOCK_SIZE,                struct bch_sb, block_size, 0, 16);
-
-LE64_BITMASK(BCH_SB_INITIALIZED,       struct bch_sb, flags[0],  0,  1);
-LE64_BITMASK(BCH_SB_CLEAN,             struct bch_sb, flags[0],  1,  2);
-LE64_BITMASK(BCH_SB_CSUM_TYPE,         struct bch_sb, flags[0],  2,  8);
-LE64_BITMASK(BCH_SB_ERROR_ACTION,      struct bch_sb, flags[0],  8, 12);
-
-LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,   struct bch_sb, flags[0], 12, 28);
-
-LE64_BITMASK(BCH_SB_GC_RESERVE,                struct bch_sb, flags[0], 28, 33);
-LE64_BITMASK(BCH_SB_ROOT_RESERVE,      struct bch_sb, flags[0], 33, 40);
-
-LE64_BITMASK(BCH_SB_META_CSUM_TYPE,    struct bch_sb, flags[0], 40, 44);
-LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,    struct bch_sb, flags[0], 44, 48);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,        struct bch_sb, flags[0], 48, 52);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,        struct bch_sb, flags[0], 52, 56);
-
-LE64_BITMASK(BCH_SB_POSIX_ACL,         struct bch_sb, flags[0], 56, 57);
-LE64_BITMASK(BCH_SB_USRQUOTA,          struct bch_sb, flags[0], 57, 58);
-LE64_BITMASK(BCH_SB_GRPQUOTA,          struct bch_sb, flags[0], 58, 59);
-LE64_BITMASK(BCH_SB_PRJQUOTA,          struct bch_sb, flags[0], 59, 60);
-
-LE64_BITMASK(BCH_SB_HAS_ERRORS,                struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
-
-LE64_BITMASK(BCH_SB_BIG_ENDIAN,                struct bch_sb, flags[0], 62, 63);
-LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
-                                       struct bch_sb, flags[0], 63, 64);
-
-LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
-LE64_BITMASK(BCH_SB_INODE_32BIT,       struct bch_sb, flags[1],  8,  9);
-
-LE64_BITMASK(BCH_SB_128_BIT_MACS,      struct bch_sb, flags[1],  9, 10);
-LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,   struct bch_sb, flags[1], 10, 14);
-
-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
-                                       struct bch_sb, flags[1], 14, 20);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
-
-LE64_BITMASK(BCH_SB_PROMOTE_TARGET,    struct bch_sb, flags[1], 28, 40);
-LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
-LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
-
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
-                                       struct bch_sb, flags[2],  0,  4);
-LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,  struct bch_sb, flags[2],  4, 64);
-
-LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
-LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
-LE64_BITMASK(BCH_SB_SHARD_INUMS,       struct bch_sb, flags[3], 28, 29);
-LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-LE64_BITMASK(BCH_SB_MULTI_DEVICE,      struct bch_sb,  flags[3], 63, 64);
-LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
-LE64_BITMASK(BCH_SB_NOCOW,             struct bch_sb, flags[4], 33, 34);
-LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE,   struct bch_sb, flags[4], 54, 56);
-
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
-                                       struct bch_sb, flags[4], 60, 64);
-
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
-                                       struct bch_sb, flags[5],  0, 16);
-LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
-                                       struct bch_sb, flags[5], 16, 32);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,  struct bch_sb, flags[5], 32, 48);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
-                                       struct bch_sb, flags[5], 48, 64);
-LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6],  0,  4);
-LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
-LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
-LE64_BITMASK(BCH_SB_DEGRADED_ACTION,   struct bch_sb, flags[6], 20, 22);
-LE64_BITMASK(BCH_SB_CASEFOLD,          struct bch_sb, flags[6], 22, 23);
-LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
-
-static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
-       return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
-       SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
-       SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
-       return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
-               (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
-       SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
-       SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-/*
- * Features:
- *
- * journal_seq_blacklist_v3:   gates BCH_SB_FIELD_journal_seq_blacklist
- * reflink:                    gates KEY_TYPE_reflink
- * inline_data:                        gates KEY_TYPE_inline_data
- * new_siphash:                        gates BCH_STR_HASH_siphash
- * new_extent_overwrite:       gates BTREE_NODE_NEW_EXTENT_OVERWRITE
- */
-#define BCH_SB_FEATURES()                      \
-       x(lz4,                          0)      \
-       x(gzip,                         1)      \
-       x(zstd,                         2)      \
-       x(atomic_nlink,                 3)      \
-       x(ec,                           4)      \
-       x(journal_seq_blacklist_v3,     5)      \
-       x(reflink,                      6)      \
-       x(new_siphash,                  7)      \
-       x(inline_data,                  8)      \
-       x(new_extent_overwrite,         9)      \
-       x(incompressible,               10)     \
-       x(btree_ptr_v2,                 11)     \
-       x(extents_above_btree_updates,  12)     \
-       x(btree_updates_journalled,     13)     \
-       x(reflink_inline_data,          14)     \
-       x(new_varint,                   15)     \
-       x(journal_no_flush,             16)     \
-       x(alloc_v2,                     17)     \
-       x(extents_across_btree_nodes,   18)     \
-       x(incompat_version_field,       19)     \
-       x(casefolding,                  20)     \
-       x(no_alloc_info,                21)     \
-       x(small_image,                  22)
-
-#define BCH_SB_FEATURES_ALWAYS                         \
-       (BIT_ULL(BCH_FEATURE_new_extent_overwrite)|     \
-        BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
-        BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
-        BIT_ULL(BCH_FEATURE_alloc_v2)|\
-        BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
-
-#define BCH_SB_FEATURES_ALL                            \
-       (BCH_SB_FEATURES_ALWAYS|                        \
-        BIT_ULL(BCH_FEATURE_new_siphash)|              \
-        BIT_ULL(BCH_FEATURE_btree_ptr_v2)|             \
-        BIT_ULL(BCH_FEATURE_new_varint)|               \
-        BIT_ULL(BCH_FEATURE_journal_no_flush)|         \
-        BIT_ULL(BCH_FEATURE_incompat_version_field))
-
-enum bch_sb_feature {
-#define x(f, n) BCH_FEATURE_##f,
-       BCH_SB_FEATURES()
-#undef x
-       BCH_FEATURE_NR,
-};
-
-#define BCH_SB_COMPAT()                                        \
-       x(alloc_info,                           0)      \
-       x(alloc_metadata,                       1)      \
-       x(extents_above_btree_updates_done,     2)      \
-       x(bformat_overflow_done,                3)
-
-enum bch_sb_compat {
-#define x(f, n) BCH_COMPAT_##f,
-       BCH_SB_COMPAT()
-#undef x
-       BCH_COMPAT_NR,
-};
-
-/* options: */
-
-#define BCH_VERSION_UPGRADE_OPTS()     \
-       x(compatible,           0)      \
-       x(incompatible,         1)      \
-       x(none,                 2)
-
-enum bch_version_upgrade_opts {
-#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
-       BCH_VERSION_UPGRADE_OPTS()
-#undef x
-};
-
-#define BCH_REPLICAS_MAX               4U
-
-#define BCH_BKEY_PTRS_MAX              16U
-
-#define BCH_ERROR_ACTIONS()            \
-       x(continue,             0)      \
-       x(fix_safe,             1)      \
-       x(panic,                2)      \
-       x(ro,                   3)
-
-enum bch_error_actions {
-#define x(t, n) BCH_ON_ERROR_##t = n,
-       BCH_ERROR_ACTIONS()
-#undef x
-       BCH_ON_ERROR_NR
-};
-
-#define BCH_DEGRADED_ACTIONS()         \
-       x(ask,                  0)      \
-       x(yes,                  1)      \
-       x(very,                 2)      \
-       x(no,                   3)
-
-enum bch_degraded_actions {
-#define x(t, n) BCH_DEGRADED_##t = n,
-       BCH_DEGRADED_ACTIONS()
-#undef x
-       BCH_DEGRADED_ACTIONS_NR
-};
-
-#define BCH_STR_HASH_TYPES()           \
-       x(crc32c,               0)      \
-       x(crc64,                1)      \
-       x(siphash_old,          2)      \
-       x(siphash,              3)
-
-enum bch_str_hash_type {
-#define x(t, n) BCH_STR_HASH_##t = n,
-       BCH_STR_HASH_TYPES()
-#undef x
-       BCH_STR_HASH_NR
-};
-
-#define BCH_STR_HASH_OPTS()            \
-       x(crc32c,               0)      \
-       x(crc64,                1)      \
-       x(siphash,              2)
-
-enum bch_str_hash_opts {
-#define x(t, n) BCH_STR_HASH_OPT_##t = n,
-       BCH_STR_HASH_OPTS()
-#undef x
-       BCH_STR_HASH_OPT_NR
-};
-
-#define BCH_CSUM_TYPES()                       \
-       x(none,                         0)      \
-       x(crc32c_nonzero,               1)      \
-       x(crc64_nonzero,                2)      \
-       x(chacha20_poly1305_80,         3)      \
-       x(chacha20_poly1305_128,        4)      \
-       x(crc32c,                       5)      \
-       x(crc64,                        6)      \
-       x(xxhash,                       7)
-
-enum bch_csum_type {
-#define x(t, n) BCH_CSUM_##t = n,
-       BCH_CSUM_TYPES()
-#undef x
-       BCH_CSUM_NR
-};
-
-static const __maybe_unused unsigned bch_crc_bytes[] = {
-       [BCH_CSUM_none]                         = 0,
-       [BCH_CSUM_crc32c_nonzero]               = 4,
-       [BCH_CSUM_crc32c]                       = 4,
-       [BCH_CSUM_crc64_nonzero]                = 8,
-       [BCH_CSUM_crc64]                        = 8,
-       [BCH_CSUM_xxhash]                       = 8,
-       [BCH_CSUM_chacha20_poly1305_80]         = 10,
-       [BCH_CSUM_chacha20_poly1305_128]        = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
-       switch (type) {
-       case BCH_CSUM_chacha20_poly1305_80:
-       case BCH_CSUM_chacha20_poly1305_128:
-               return true;
-       default:
-               return false;
-       }
-}
-
-#define BCH_CSUM_OPTS()                        \
-       x(none,                 0)      \
-       x(crc32c,               1)      \
-       x(crc64,                2)      \
-       x(xxhash,               3)
-
-enum bch_csum_opt {
-#define x(t, n) BCH_CSUM_OPT_##t = n,
-       BCH_CSUM_OPTS()
-#undef x
-       BCH_CSUM_OPT_NR
-};
-
-#define BCH_COMPRESSION_TYPES()                \
-       x(none,                 0)      \
-       x(lz4_old,              1)      \
-       x(gzip,                 2)      \
-       x(lz4,                  3)      \
-       x(zstd,                 4)      \
-       x(incompressible,       5)
-
-enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
-       BCH_COMPRESSION_TYPES()
-#undef x
-       BCH_COMPRESSION_TYPE_NR
-};
-
-#define BCH_COMPRESSION_OPTS()         \
-       x(none,         0)              \
-       x(lz4,          1)              \
-       x(gzip,         2)              \
-       x(zstd,         3)
-
-enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
-       BCH_COMPRESSION_OPTS()
-#undef x
-       BCH_COMPRESSION_OPT_NR
-};
-
-/*
- * Magic numbers
- *
- * The various other data structures have their own magic numbers, which are
- * xored with the first part of the cache set's UUID
- */
-
-#define BCACHE_MAGIC                                                   \
-       UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,                           \
-                 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
-#define BCHFS_MAGIC                                                    \
-       UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,                           \
-                 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCACHEFS_STATFS_MAGIC          BCACHEFS_SUPER_MAGIC
-
-#define JSET_MAGIC             __cpu_to_le64(0x245235c1a3625032ULL)
-#define BSET_MAGIC             __cpu_to_le64(0x90135c78b99e07f5ULL)
-
-static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
-{
-       __le64 ret;
-
-       memcpy(&ret, &sb->uuid, sizeof(ret));
-       return ret;
-}
-
-static inline __u64 __jset_magic(struct bch_sb *sb)
-{
-       return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
-}
-
-static inline __u64 __bset_magic(struct bch_sb *sb)
-{
-       return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
-}
-
-/* Journal */
-
-#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES()                 \
-       x(btree_keys,           0)              \
-       x(btree_root,           1)              \
-       x(prio_ptrs,            2)              \
-       x(blacklist,            3)              \
-       x(blacklist_v2,         4)              \
-       x(usage,                5)              \
-       x(data_usage,           6)              \
-       x(clock,                7)              \
-       x(dev_usage,            8)              \
-       x(log,                  9)              \
-       x(overwrite,            10)             \
-       x(write_buffer_keys,    11)             \
-       x(datetime,             12)             \
-       x(log_bkey,             13)
-
-enum bch_jset_entry_type {
-#define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
-       BCH_JSET_ENTRY_TYPES()
-#undef x
-       BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
-       switch (e->type) {
-       case BCH_JSET_ENTRY_btree_keys:
-       case BCH_JSET_ENTRY_btree_root:
-       case BCH_JSET_ENTRY_write_buffer_keys:
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
-       struct jset_entry       entry;
-       __le64                  seq;
-};
-
-struct jset_entry_blacklist_v2 {
-       struct jset_entry       entry;
-       __le64                  start;
-       __le64                  end;
-};
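/*
 * Sketch (hypothetical helper, not the in-tree implementation): per the
 * comment above, a bset whose journal_seq falls inside a blacklisted range
 * must be ignored on recovery. Assumes le64_to_cpu() from the kernel
 * byteorder helpers.
 */
static inline _Bool example_seq_blacklisted(struct jset_entry_blacklist_v2 *bl,
					    __u64 seq)
{
	return seq >= le64_to_cpu(bl->start) && seq <= le64_to_cpu(bl->end);
}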
-
-#define BCH_FS_USAGE_TYPES()                   \
-       x(reserved,             0)              \
-       x(inodes,               1)              \
-       x(key_version,          2)
-
-enum bch_fs_usage_type {
-#define x(f, nr)       BCH_FS_USAGE_##f        = nr,
-       BCH_FS_USAGE_TYPES()
-#undef x
-       BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
-       struct jset_entry       entry;
-       __le64                  v;
-} __packed;
-
-struct jset_entry_data_usage {
-       struct jset_entry       entry;
-       __le64                  v;
-       struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
-       struct jset_entry       entry;
-       __u8                    rw;
-       __u8                    pad[7];
-       __le64                  time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
-       __le64                  buckets;
-       __le64                  sectors;
-       __le64                  fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
-       struct jset_entry       entry;
-       __le32                  dev;
-       __u32                   pad;
-
-       __le64                  _buckets_ec;            /* No longer used */
-       __le64                  _buckets_unavailable;   /* No longer used */
-
-       struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
-       return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
-               sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
-       struct jset_entry       entry;
-       u8                      d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
-       unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
-       while (b && !l->d[b - 1])
-               --b;
-       return b;
-}
-
-struct jset_entry_datetime {
-       struct jset_entry       entry;
-       __le64                  seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
-       struct bch_csum         csum;
-
-       __le64                  magic;
-       __le64                  seq;
-       __le32                  version;
-       __le32                  flags;
-
-       __le32                  u64s; /* size of _data[] in u64s */
-
-       __u8                    encrypted_start[0];
-
-       __le16                  _read_clock; /* no longer used */
-       __le16                  _write_clock;
-
-       /* Sequence number of oldest dirty journal entry */
-       __le64                  last_seq;
-
-       struct jset_entry       start[0];
-       __u64                   _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
-LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
-LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
-
-#define BCH_JOURNAL_BUCKETS_MIN                8
-
-/* Btree: */
-
-enum btree_id_flags {
-       BTREE_IS_extents        = BIT(0),
-       BTREE_IS_snapshots      = BIT(1),
-       BTREE_IS_snapshot_field = BIT(2),
-       BTREE_IS_data           = BIT(3),
-       BTREE_IS_write_buffer   = BIT(4),
-};
-
-#define BCH_BTREE_IDS()                                                                \
-       x(extents,              0,                                              \
-         BTREE_IS_extents|                                                     \
-         BTREE_IS_snapshots|                                                   \
-         BTREE_IS_data,                                                        \
-         BIT_ULL(KEY_TYPE_whiteout)|                                           \
-         BIT_ULL(KEY_TYPE_error)|                                              \
-         BIT_ULL(KEY_TYPE_cookie)|                                             \
-         BIT_ULL(KEY_TYPE_extent)|                                             \
-         BIT_ULL(KEY_TYPE_reservation)|                                        \
-         BIT_ULL(KEY_TYPE_reflink_p)|                                          \
-         BIT_ULL(KEY_TYPE_inline_data))                                        \
-       x(inodes,               1,                                              \
-         BTREE_IS_snapshots,                                                   \
-         BIT_ULL(KEY_TYPE_whiteout)|                                           \
-         BIT_ULL(KEY_TYPE_inode)|                                              \
-         BIT_ULL(KEY_TYPE_inode_v2)|                                           \
-         BIT_ULL(KEY_TYPE_inode_v3)|                                           \
-         BIT_ULL(KEY_TYPE_inode_generation))                                   \
-       x(dirents,              2,                                              \
-         BTREE_IS_snapshots,                                                   \
-         BIT_ULL(KEY_TYPE_whiteout)|                                           \
-         BIT_ULL(KEY_TYPE_hash_whiteout)|                                      \
-         BIT_ULL(KEY_TYPE_dirent))                                             \
-       x(xattrs,               3,                                              \
-         BTREE_IS_snapshots,                                                   \
-         BIT_ULL(KEY_TYPE_whiteout)|                                           \
-         BIT_ULL(KEY_TYPE_cookie)|                                             \
-         BIT_ULL(KEY_TYPE_hash_whiteout)|                                      \
-         BIT_ULL(KEY_TYPE_xattr))                                              \
-       x(alloc,                4,      0,                                      \
-         BIT_ULL(KEY_TYPE_alloc)|                                              \
-         BIT_ULL(KEY_TYPE_alloc_v2)|                                           \
-         BIT_ULL(KEY_TYPE_alloc_v3)|                                           \
-         BIT_ULL(KEY_TYPE_alloc_v4))                                           \
-       x(quotas,               5,      0,                                      \
-         BIT_ULL(KEY_TYPE_quota))                                              \
-       x(stripes,              6,      0,                                      \
-         BIT_ULL(KEY_TYPE_stripe))                                             \
-       x(reflink,              7,                                              \
-         BTREE_IS_extents|                                                     \
-         BTREE_IS_data,                                                        \
-         BIT_ULL(KEY_TYPE_reflink_v)|                                          \
-         BIT_ULL(KEY_TYPE_indirect_inline_data)|                               \
-         BIT_ULL(KEY_TYPE_error))                                              \
-       x(subvolumes,           8,      0,                                      \
-         BIT_ULL(KEY_TYPE_subvolume))                                          \
-       x(snapshots,            9,      0,                                      \
-         BIT_ULL(KEY_TYPE_snapshot))                                           \
-       x(lru,                  10,                                             \
-         BTREE_IS_write_buffer,                                                \
-         BIT_ULL(KEY_TYPE_set))                                                \
-       x(freespace,            11,                                             \
-         BTREE_IS_extents,                                                     \
-         BIT_ULL(KEY_TYPE_set))                                                \
-       x(need_discard,         12,     0,                                      \
-         BIT_ULL(KEY_TYPE_set))                                                \
-       x(backpointers,         13,                                             \
-         BTREE_IS_write_buffer,                                                \
-         BIT_ULL(KEY_TYPE_backpointer))                                        \
-       x(bucket_gens,          14,     0,                                      \
-         BIT_ULL(KEY_TYPE_bucket_gens))                                        \
-       x(snapshot_trees,       15,     0,                                      \
-         BIT_ULL(KEY_TYPE_snapshot_tree))                                      \
-       x(deleted_inodes,       16,                                             \
-         BTREE_IS_snapshot_field|                                              \
-         BTREE_IS_write_buffer,                                                \
-         BIT_ULL(KEY_TYPE_set))                                                \
-       x(logged_ops,           17,     0,                                      \
-         BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
-         BIT_ULL(KEY_TYPE_logged_op_finsert)|                                  \
-         BIT_ULL(KEY_TYPE_inode_alloc_cursor))                                 \
-       x(rebalance_work,       18,                                             \
-         BTREE_IS_snapshot_field|                                              \
-         BTREE_IS_write_buffer,                                                \
-         BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))                       \
-       x(subvolume_children,   19,     0,                                      \
-         BIT_ULL(KEY_TYPE_set))                                                \
-       x(accounting,           20,                                             \
-         BTREE_IS_snapshot_field|                                              \
-         BTREE_IS_write_buffer,                                                \
-         BIT_ULL(KEY_TYPE_accounting))                                         \
-
-enum btree_id {
-#define x(name, nr, ...) BTREE_ID_##name = nr,
-       BCH_BTREE_IDS()
-#undef x
-       BTREE_ID_NR
-};
-
-/*
- * Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with 64 bit bitfields - and we also need a bit for
- * the interior btree node type:
- */
-#define BTREE_ID_NR_MAX                63
-
-static inline bool btree_id_is_alloc(enum btree_id id)
-{
-       switch (id) {
-       case BTREE_ID_alloc:
-       case BTREE_ID_backpointers:
-       case BTREE_ID_need_discard:
-       case BTREE_ID_freespace:
-       case BTREE_ID_bucket_gens:
-       case BTREE_ID_lru:
-       case BTREE_ID_accounting:
-               return true;
-       default:
-               return false;
-       }
-}
-
-#define BTREE_MAX_DEPTH                4U
-
-/* Btree nodes */
-
-/*
- * Btree nodes
- *
- * On disk a btree node is a list/log of these; within each set the keys are
- * sorted
- */
-struct bset {
-       __le64                  seq;
-
-       /*
-        * Highest journal entry this bset contains keys for.
-        * If on recovery we don't see that journal entry, this bset is ignored:
-        * this allows us to preserve the order of all index updates after a
-        * crash, since the journal records a total order of all index updates
-        * and anything that didn't make it to the journal doesn't get used.
-        */
-       __le64                  journal_seq;
-
-       __le32                  flags;
-       __le16                  version;
-       __le16                  u64s; /* count of _data[] in u64s */
-
-       struct bkey_packed      start[0];
-       __u64                   _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BSET_CSUM_TYPE,   struct bset, flags, 0, 4);
-
-LE32_BITMASK(BSET_BIG_ENDIAN,  struct bset, flags, 4, 5);
-LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
-                               struct bset, flags, 5, 6);
-
-/* Sector offset within the btree node: */
-LE32_BITMASK(BSET_OFFSET,      struct bset, flags, 16, 32);
-
-struct btree_node {
-       struct bch_csum         csum;
-       __le64                  magic;
-
-       /* this flags field is encrypted, unlike bset->flags: */
-       __le64                  flags;
-
-       /* Closed interval: */
-       struct bpos             min_key;
-       struct bpos             max_key;
-       struct bch_extent_ptr   _ptr; /* not used anymore */
-       struct bkey_format      format;
-
-       union {
-       struct bset             keys;
-       struct {
-               __u8            pad[22];
-               __le16          u64s;
-               __u64           _data[0];
-
-       };
-} __packed __aligned(8);
-
-LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags,  0,  4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags,  4,  8);
-LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
-                               struct btree_node, flags,  8,  9);
-LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags,  9, 25);
-/* 25-32 unused */
-LE64_BITMASK(BTREE_NODE_SEQ,   struct btree_node, flags, 32, 64);
-
-static inline __u64 BTREE_NODE_ID(struct btree_node *n)
-{
-       return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
-}
-
-static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
-{
-       SET_BTREE_NODE_ID_LO(n, v);
-       SET_BTREE_NODE_ID_HI(n, v >> 4);
-}
-
-struct btree_node_entry {
-       struct bch_csum         csum;
-
-       union {
-       struct bset             keys;
-       struct {
-               __u8            pad[22];
-               __le16          u64s;
-               __u64           _data[0];
-       };
-       };
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
deleted file mode 100644 (file)
index 52594e9..0000000
+++ /dev/null
@@ -1,473 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IOCTL_H
-#define _BCACHEFS_IOCTL_H
-
-#include <linux/uuid.h>
-#include <asm/ioctl.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-
-/*
- * Flags common to multiple ioctls:
- */
-#define BCH_FORCE_IF_DATA_LOST         (1 << 0)
-#define BCH_FORCE_IF_METADATA_LOST     (1 << 1)
-#define BCH_FORCE_IF_DATA_DEGRADED     (1 << 2)
-#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
-
-#define BCH_FORCE_IF_LOST                      \
-       (BCH_FORCE_IF_DATA_LOST|                \
-        BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED                  \
-       (BCH_FORCE_IF_DATA_DEGRADED|            \
-        BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX                   (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV                   (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE     _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL  _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
-       __u32                   flags;
-       __u32                   nr_devs;
-       __u64                   pad;
-       __u64                   devs[];
-};
-
-struct bch_ioctl_incremental {
-       __u32                   flags;
-       __u64                   pad;
-       __u64                   dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID   _IOR(0xbc,      1,  struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START                _IOW(0xbc,      2,  struct bch_ioctl_start)
-#define BCH_IOCTL_STOP         _IO(0xbc,       3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD     _IOW(0xbc,      4,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE  _IOW(0xbc,      5,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE  _IOW(0xbc,      6,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc,      7,  struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,    8,  struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA         _IOW(0xbc,      10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE     _IOWR(0xbc,     11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE    _IOWR(0xbc,     11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER   _IOW(0xbc,      12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc,      13,  struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      14,  struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc,  16,  struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17,  struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc,     18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc,      19,  struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE  _IOW(0xbc,      20,  struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc,  21,  struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc,    21,  struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS      _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns the user visible UUID, not the internal UUID (which may never be
- * changed); the filesystem's sysfs directory may be found under
- * /sys/fs/bcachefs with this UUID.
- */
-struct bch_ioctl_query_uuid {
-       __uuid_t                uuid;
-};
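/*
 * Illustrative userspace sketch: query the user visible UUID, which names
 * the filesystem's sysfs directory. Assumes libuuid's uuid_unparse(), an
 * open fd on any file in the filesystem, and elides error handling.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <uuid/uuid.h>

static void example_print_sysfs_dir(int fs_fd)
{
	struct bch_ioctl_query_uuid u;
	char s[37];

	if (!ioctl(fs_fd, BCH_IOCTL_QUERY_UUID, &u)) {
		uuid_unparse((const unsigned char *) &u.uuid, s);
		printf("/sys/fs/bcachefs/%s\n", s);
	}
}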
-
-#if 0
-struct bch_ioctl_start {
-       __u32                   flags;
-       __u32                   pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either online or offline.
- *
- * Will fail if removing @dev would leave us with insufficient read write
- * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
- * flags are set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * All existing data on @dev will be available once the device is online,
- * exactly as if @dev had been present when the filesystem was first mounted.
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
-       __u32                   flags;
-       __u32                   pad;
-       __u64                   dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state          - one of the bch_member_state states (rw, ro, failed,
- *                       spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
-       __u32                   flags;
-       __u8                    new_state;
-       __u8                    pad[3];
-       __u64                   dev;
-};
-
-#define BCH_DATA_OPS()                 \
-       x(scrub,                0)      \
-       x(rereplicate,          1)      \
-       x(migrate,              2)      \
-       x(rewrite_old_nodes,    3)      \
-       x(drop_extra_replicas,  4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
-       BCH_DATA_OPS()
-#undef x
-       BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
- */
-struct bch_ioctl_data {
-       __u16                   op;
-       __u8                    start_btree;
-       __u8                    end_btree;
-       __u32                   flags;
-
-       struct bpos             start_pos;
-       struct bpos             end_pos;
-
-       union {
-       struct {
-               __u32           dev;
-               __u32           data_types;
-       }                       scrub;
-       struct {
-               __u32           dev;
-               __u32           pad;
-       }                       migrate;
-       struct {
-               __u64           pad[8];
-       };
-       };
-} __packed __aligned(8);
-
-enum bch_data_event {
-       BCH_DATA_EVENT_PROGRESS = 0,
-       /* XXX: add an event for reporting errors */
-       BCH_DATA_EVENT_NR       = 1,
-};
-
-enum data_progress_data_type_special {
-       DATA_PROGRESS_DATA_TYPE_phys    = 254,
-       DATA_PROGRESS_DATA_TYPE_done    = 255,
-};
-
-struct bch_ioctl_data_progress {
-       __u8                    data_type;
-       __u8                    btree_id;
-       __u8                    pad[2];
-       struct bpos             pos;
-
-       __u64                   sectors_done;
-       __u64                   sectors_total;
-       __u64                   sectors_error_corrected;
-       __u64                   sectors_error_uncorrected;
-} __packed __aligned(8);
-
-enum bch_ioctl_data_event_ret {
-       BCH_IOCTL_DATA_EVENT_RET_done           = 1,
-       BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
-};
-
-struct bch_ioctl_data_event {
-       __u8                    type;
-       __u8                    ret;
-       __u8                    pad[6];
-       union {
-       struct bch_ioctl_data_progress p;
-       __u64                   pad2[15];
-       };
-} __packed __aligned(8);
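/*
 * Illustrative userspace sketch of the event loop described above: start a
 * rereplicate job over the whole keyspace and read progress events until the
 * job reports done. POS_MIN/POS_MAX come from the format header; error
 * handling is elided.
 */
#include <sys/ioctl.h>
#include <unistd.h>

static int example_rereplicate(int fs_fd)
{
	struct bch_ioctl_data d = {
		.op		= BCH_DATA_OP_rereplicate,
		.start_btree	= 0,
		.end_btree	= BTREE_ID_NR,
		.start_pos	= POS_MIN,
		.end_pos	= POS_MAX,
	};
	struct bch_ioctl_data_event e;
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &d);

	if (job_fd < 0)
		return -1;

	while (read(job_fd, &e, sizeof(e)) == (ssize_t) sizeof(e) &&
	       e.type == BCH_DATA_EVENT_PROGRESS &&
	       e.p.data_type != DATA_PROGRESS_DATA_TYPE_done)
		;	/* e.p.sectors_done / e.p.sectors_total is progress */

	close(job_fd);	/* closing the fd stops the job */
	return 0;
}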
-
-struct bch_replicas_usage {
-       __u64                   sectors;
-       struct bch_replicas_entry_v1 r;
-} __packed;
-
-static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
-{
-       return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
-}
-
-static inline struct bch_replicas_usage *
-replicas_usage_next(struct bch_replicas_usage *u)
-{
-       return (void *) u + replicas_usage_bytes(u);
-}
-
-/* Obsolete */
-/*
- * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_fs_usage {
-       __u64                   capacity;
-       __u64                   used;
-       __u64                   online_reserved;
-       __u64                   persistent_reserved[BCH_REPLICAS_MAX];
-
-       __u32                   replica_entries_bytes;
-       __u32                   pad;
-
-       struct bch_replicas_usage replicas[];
-};
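/*
 * Sketch (hypothetical helper): the replicas entries above are variable
 * size, so they are walked with replicas_usage_next() until
 * replica_entries_bytes is exhausted.
 */
static inline __u64 example_total_replica_sectors(struct bch_ioctl_fs_usage *u)
{
	struct bch_replicas_usage *r = u->replicas;
	struct bch_replicas_usage *end =
		(void *) u->replicas + u->replica_entries_bytes;
	__u64 sectors = 0;

	for (; r < end; r = replicas_usage_next(r))
		sectors += r->sectors;

	return sectors;
}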
-
-/* Obsolete */
-/*
- * BCH_IOCTL_DEV_USAGE: query device disk space usage
- *
- * Returns disk space usage broken out by data type - both by buckets and
- * sectors.
- */
-struct bch_ioctl_dev_usage {
-       __u64                   dev;
-       __u32                   flags;
-       __u8                    state;
-       __u8                    pad[7];
-
-       __u32                   bucket_size;
-       __u64                   nr_buckets;
-
-       __u64                   buckets_ec;
-
-       struct bch_ioctl_dev_usage_type {
-               __u64           buckets;
-               __u64           sectors;
-               __u64           fragmented;
-       }                       d[10];
-};
-
-/* Obsolete */
-struct bch_ioctl_dev_usage_v2 {
-       __u64                   dev;
-       __u32                   flags;
-       __u8                    state;
-       __u8                    nr_data_types;
-       __u8                    pad[6];
-
-       __u32                   bucket_size;
-       __u64                   nr_buckets;
-
-       struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb         - buffer to read into
- * @size       - size of userspace allocated buffer
- * @dev                - device to read superblock for, if BCH_READ_DEV flag is
- *               specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
-       __u32                   flags;
-       __u32                   pad;
-       __u64                   dev;
-       __u64                   size;
-       __u64                   sb;
-};
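/*
 * Hedged userspace sketch of the -ERANGE retry contract described above.
 * Editorial illustration only: BCH_IOCTL_READ_SUPER refers to the request
 * macro defined elsewhere in this header, @fs_fd is assumed to be an open
 * bcachefs filesystem, and <sys/ioctl.h>, <stdlib.h>, <errno.h> are assumed
 * included.
 */
static void *read_super_sketch(int fs_fd, size_t size)
{
	for (;;) {
		void *buf = malloc(size);
		if (!buf)
			return NULL;

		struct bch_ioctl_read_super arg = {
			.size	= size,
			.sb	= (__u64) (unsigned long) buf,
		};

		if (!ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &arg))
			return buf;	/* buf now holds the superblock */

		free(buf);
		if (errno != ERANGE)
			return NULL;	/* real error */
		size *= 2;		/* buffer too small: grow and retry */
	}
}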
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if disk is a (online) member - if so, returns device's index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
-       __u64                   dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev                - member to resize
- * @nbuckets   - new number of buckets
- */
-struct bch_ioctl_disk_resize {
-       __u32                   flags;
-       __u32                   pad;
-       __u64                   dev;
-       __u64                   nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev                - member to resize
- * @nbuckets   - new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
-       __u32                   flags;
-       __u32                   pad;
-       __u64                   dev;
-       __u64                   nbuckets;
-};
-
-struct bch_ioctl_subvolume {
-       __u32                   flags;
-       __u32                   dirfd;
-       __u16                   mode;
-       __u16                   pad[3];
-       __u64                   dst_ptr;
-       __u64                   src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE     (1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO         (1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
-       __u64                   flags;
-       __u64                   opts;           /* string */
-       __u64                   nr_devs;
-       __u64                   devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
-       __u64                   flags;
-       __u64                   opts;           /* string */
-};
-
-/*
- * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_query_accounting {
-       __u64                   capacity;
-       __u64                   used;
-       __u64                   online_reserved;
-
-       __u32                   accounting_u64s; /* input parameter */
-       __u32                   accounting_types_mask; /* input parameter */
-
-       struct bkey_i_accounting accounting[];
-};
-
-#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
-
-struct bch_ioctl_query_counters {
-       __u16                   nr;
-       __u16                   flags;
-       __u32                   pad;
-       __u64                   d[];
-};
-
-#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
deleted file mode 100644
index ee823c6..0000000
--- a/fs/bcachefs/bkey.c
+++ /dev/null
@@ -1,1112 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_cmp.h"
-#include "bkey_methods.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *out,
-                                    const struct bkey_format *f,
-                                    const struct bkey_packed *k)
-{
-       const u64 *p = high_word(f, k);
-       unsigned word_bits = 64 - high_bit_offset;
-       unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
-       u64 v = *p & (~0ULL >> high_bit_offset);
-
-       if (!nr_key_bits) {
-               prt_str(out, "(empty)");
-               return;
-       }
-
-       while (1) {
-               unsigned next_key_bits = nr_key_bits;
-
-               if (nr_key_bits < 64) {
-                       v >>= 64 - nr_key_bits;
-                       next_key_bits = 0;
-               } else {
-                       next_key_bits -= 64;
-               }
-
-               bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
-
-               if (!next_key_bits)
-                       break;
-
-               prt_char(out, ' ');
-
-               p = next_word(p);
-               v = *p;
-               word_bits = 64;
-               nr_key_bits = next_key_bits;
-       }
-}
-
-static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
-                                   const struct bkey *unpacked,
-                                   const struct bkey_format *format)
-{
-       struct bkey tmp;
-
-       BUG_ON(bkeyp_val_u64s(format, packed) !=
-              bkey_val_u64s(unpacked));
-
-       BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
-       tmp = __bch2_bkey_unpack_key(format, packed);
-
-       if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
-                     format->key_u64s,
-                     format->bits_per_field[0],
-                     format->bits_per_field[1],
-                     format->bits_per_field[2],
-                     format->bits_per_field[3],
-                     format->bits_per_field[4]);
-
-               prt_printf(&buf, "compiled unpack: ");
-               bch2_bkey_to_text(&buf, unpacked);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "c unpack:        ");
-               bch2_bkey_to_text(&buf, &tmp);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "compiled unpack: ");
-               bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
-                                               (struct bkey_packed *) unpacked);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "c unpack:        ");
-               bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
-                                               (struct bkey_packed *) &tmp);
-               prt_newline(&buf);
-
-               panic("%s", buf.buf);
-       }
-}
-
-static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
-                                        const struct bkey *unpacked,
-                                        const struct bkey_format *format)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
-               __bch2_bkey_pack_verify(packed, unpacked, format);
-}
-
-struct pack_state {
-       const struct bkey_format *format;
-       unsigned                bits;   /* bits remaining in current word */
-       u64                     w;      /* current word */
-       u64                     *p;     /* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
-                                        struct bkey_packed *k)
-{
-       u64 *p = high_word(format, k);
-
-       return (struct pack_state) {
-               .format = format,
-               .bits   = 64 - high_bit_offset,
-               .w      = 0,
-               .p      = p,
-       };
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
-                             struct bkey_packed *k)
-{
-       EBUG_ON(state->p <  k->_data);
-       EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
-
-       *state->p = state->w;
-}
-
-struct unpack_state {
-       const struct bkey_format *format;
-       unsigned                bits;   /* bits remaining in current word */
-       u64                     w;      /* current word */
-       const u64               *p;     /* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
-                                            const struct bkey_packed *k)
-{
-       const u64 *p = high_word(format, k);
-
-       return (struct unpack_state) {
-               .format = format,
-               .bits   = 64 - high_bit_offset,
-               .w      = *p << high_bit_offset,
-               .p      = p,
-       };
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
-       unsigned bits = state->format->bits_per_field[field];
-       u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
-       if (bits >= state->bits) {
-               v = state->w >> (64 - bits);
-               bits -= state->bits;
-
-               state->p = next_word(state->p);
-               state->w = *state->p;
-               state->bits = 64;
-       }
-
-       /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
-       v |= (state->w >> 1) >> (63 - bits);
-       state->w <<= bits;
-       state->bits -= bits;
-
-       return v + offset;
-}
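/*
 * Editorial note on the idiom above: the intent is w >> (64 - bits), but
 * bits may be 0 here and a shift count of 64 is undefined in C.  Splitting
 * it as (w >> 1) >> (63 - bits) is equivalent for bits in [0, 63] and never
 * shifts by more than 63; e.g. bits == 0 gives (w >> 1) >> 63 == 0, exactly
 * what w >> 64 is meant to produce.
 */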
-
-__always_inline
-static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
-       unsigned bits = state->format->bits_per_field[field];
-
-       if (bits) {
-               if (bits > state->bits) {
-                       bits -= state->bits;
-                       /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
-                       state->w |= (v >> 1) >> (bits - 1);
-
-                       *state->p = state->w;
-                       state->p = next_word(state->p);
-                       state->w = 0;
-                       state->bits = 64;
-               }
-
-               state->bits -= bits;
-               state->w |= v << state->bits;
-       }
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
-       unsigned bits = state->format->bits_per_field[field];
-       u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
-       if (v < offset)
-               return false;
-
-       v -= offset;
-
-       if (fls64(v) > bits)
-               return false;
-
-       __set_inc_field(state, field, v);
-       return true;
-}
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
-                                  struct bkey_packed *out,
-                                  const struct bkey_format *in_f,
-                                  const struct bkey_packed *in)
-{
-       struct pack_state out_s = pack_state_init(out_f, out);
-       struct unpack_state in_s = unpack_state_init(in_f, in);
-       u64 *w = out->_data;
-       unsigned i;
-
-       *w = 0;
-
-       for (i = 0; i < BKEY_NR_FIELDS; i++)
-               if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
-                       return false;
-
-       /* Can't happen because the val would be too big to unpack: */
-       EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
-       pack_state_finish(&out_s, out);
-       out->u64s       = out_f->key_u64s + in->u64s - in_f->key_u64s;
-       out->needs_whiteout = in->needs_whiteout;
-       out->type       = in->type;
-
-       return true;
-}
-
-bool bch2_bkey_transform(const struct bkey_format *out_f,
-                       struct bkey_packed *out,
-                       const struct bkey_format *in_f,
-                       const struct bkey_packed *in)
-{
-       if (!bch2_bkey_transform_key(out_f, out, in_f, in))
-               return false;
-
-       memcpy_u64s((u64 *) out + out_f->key_u64s,
-                   (u64 *) in + in_f->key_u64s,
-                   (in->u64s - in_f->key_u64s));
-       return true;
-}
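/*
 * Editorial usage sketch: repacking a key from one node format to another
 * without a full unpack/repack round trip.  Per the note above, the caller
 * must set out->format itself, and this must not be used on extents.
 */
static bool transform_sketch(const struct bkey_format *new_f,
			     struct bkey_packed *dst,
			     const struct bkey_format *old_f,
			     const struct bkey_packed *src)
{
	if (!bch2_bkey_transform(new_f, dst, old_f, src))
		return false;			/* src doesn't fit in new_f */

	dst->format = KEY_FORMAT_LOCAL_BTREE;	/* transform doesn't set it */
	return true;
}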
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
-                             const struct bkey_packed *in)
-{
-       struct unpack_state state = unpack_state_init(format, in);
-       struct bkey out;
-
-       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-       EBUG_ON(in->u64s < format->key_u64s);
-       EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-       EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
-       out.u64s        = BKEY_U64s + in->u64s - format->key_u64s;
-       out.format      = KEY_FORMAT_CURRENT;
-       out.needs_whiteout = in->needs_whiteout;
-       out.type        = in->type;
-       out.pad[0]      = 0;
-
-#define x(id, field)   out.field = get_inc_field(&state, id);
-       bkey_fields()
-#undef x
-
-       return out;
-}
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
-                                    const struct bkey_packed *in)
-{
-       struct unpack_state state = unpack_state_init(format, in);
-       struct bpos out;
-
-       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-       EBUG_ON(in->u64s < format->key_u64s);
-       EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
-       out.inode       = get_inc_field(&state, BKEY_FIELD_INODE);
-       out.offset      = get_inc_field(&state, BKEY_FIELD_OFFSET);
-       out.snapshot    = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
-       return out;
-}
-#endif
-
-/**
- * bch2_bkey_pack_key -- pack just the key, not the value
- * @out:       packed result
- * @in:                key to pack
- * @format:    format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-                       const struct bkey_format *format)
-{
-       struct pack_state state = pack_state_init(format, out);
-       u64 *w = out->_data;
-
-       EBUG_ON((void *) in == (void *) out);
-       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-       EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
-       *w = 0;
-
-#define x(id, field)   if (!set_inc_field(&state, id, in->field)) return false;
-       bkey_fields()
-#undef x
-       pack_state_finish(&state, out);
-       out->u64s       = format->key_u64s + in->u64s - BKEY_U64s;
-       out->format     = KEY_FORMAT_LOCAL_BTREE;
-       out->needs_whiteout = in->needs_whiteout;
-       out->type       = in->type;
-
-       bch2_bkey_pack_verify(out, in, format);
-       return true;
-}
-
-/**
- * bch2_bkey_unpack -- unpack the key and the value
- * @b:         btree node of @src key (for packed format)
- * @dst:       unpacked result
- * @src:       packed input
- */
-void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
-                     const struct bkey_packed *src)
-{
-       __bkey_unpack_key(b, &dst->k, src);
-
-       memcpy_u64s(&dst->v,
-                   bkeyp_val(&b->format, src),
-                   bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bch2_bkey_pack -- pack the key and the value
- * @dst:       packed result
- * @src:       unpacked input
- * @format:    format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
-                   const struct bkey_format *format)
-{
-       struct bkey_packed tmp;
-
-       if (!bch2_bkey_pack_key(&tmp, &src->k, format))
-               return false;
-
-       memmove_u64s((u64 *) dst + format->key_u64s,
-                    &src->v,
-                    bkey_val_u64s(&src->k));
-       memcpy_u64s_small(dst, &tmp, format->key_u64s);
-
-       return true;
-}
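/*
 * Editorial sketch of the full pack pipeline: build a format wide enough
 * for the keys that will live in a node, then pack through it.  All
 * helpers used here are declared in bkey.h.
 */
static void pack_pipeline_sketch(const struct bkey *k)
{
	struct bkey_format_state s;
	struct bkey_packed p;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_key(&s, k);	/* widen per-field min/max */

	struct bkey_format f = bch2_bkey_format_done(&s);

	if (bch2_bkey_pack_key(&p, k, &f)) {
		/* p now holds the key (sans value) in format f */
	}
}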
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
-       unsigned bits = state->format->bits_per_field[field];
-       u64 offset = le64_to_cpu(state->format->field_offset[field]);
-       bool ret = true;
-
-       EBUG_ON(v < offset);
-       v -= offset;
-
-       if (fls64(v) > bits) {
-               v = ~(~0ULL << bits);
-               ret = false;
-       }
-
-       __set_inc_field(state, field, v);
-       return ret;
-}
-
-static bool bkey_packed_successor(struct bkey_packed *out,
-                                 const struct btree *b,
-                                 struct bkey_packed k)
-{
-       const struct bkey_format *f = &b->format;
-       unsigned nr_key_bits = b->nr_key_bits;
-       unsigned first_bit, offset;
-       u64 *p;
-
-       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
-       if (!nr_key_bits)
-               return false;
-
-       *out = k;
-
-       first_bit = high_bit_offset + nr_key_bits - 1;
-       p = nth_word(high_word(f, out), first_bit >> 6);
-       offset = 63 - (first_bit & 63);
-
-       while (nr_key_bits) {
-               unsigned bits = min(64 - offset, nr_key_bits);
-               u64 mask = (~0ULL >> (64 - bits)) << offset;
-
-               if ((*p & mask) != mask) {
-                       *p += 1ULL << offset;
-                       EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
-                       return true;
-               }
-
-               *p &= ~mask;
-               p = prev_word(p);
-               nr_key_bits -= bits;
-               offset = 0;
-       }
-
-       return false;
-}
-
-static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
-{
-       for (unsigned i = 0; i < f->nr_fields; i++) {
-               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-               u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-               u64 packed_max = f->bits_per_field[i]
-                       ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-                       : 0;
-               u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-               if (packed_max + field_offset < packed_max ||
-                   packed_max + field_offset > unpacked_max)
-                       return true;
-       }
-
-       return false;
-}
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
-                                          struct bpos in,
-                                          const struct btree *b)
-{
-       const struct bkey_format *f = &b->format;
-       struct pack_state state = pack_state_init(f, out);
-       u64 *w = out->_data;
-       struct bpos orig = in;
-       bool exact = true;
-       unsigned i;
-
-       /*
-        * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
-        * byte header, but pack_pos() won't if the len/version fields are big
-        * enough - we need to make sure to zero them out:
-        */
-       for (i = 0; i < f->key_u64s; i++)
-               w[i] = 0;
-
-       if (unlikely(in.snapshot <
-                    le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
-               if (!in.offset-- &&
-                   !in.inode--)
-                       return BKEY_PACK_POS_FAIL;
-               in.snapshot     = KEY_SNAPSHOT_MAX;
-               exact = false;
-       }
-
-       if (unlikely(in.offset <
-                    le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
-               if (!in.inode--)
-                       return BKEY_PACK_POS_FAIL;
-               in.offset       = KEY_OFFSET_MAX;
-               in.snapshot     = KEY_SNAPSHOT_MAX;
-               exact = false;
-       }
-
-       if (unlikely(in.inode <
-                    le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
-               return BKEY_PACK_POS_FAIL;
-
-       if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
-               in.offset       = KEY_OFFSET_MAX;
-               in.snapshot     = KEY_SNAPSHOT_MAX;
-               exact = false;
-       }
-
-       if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
-               in.snapshot     = KEY_SNAPSHOT_MAX;
-               exact = false;
-       }
-
-       if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
-               exact = false;
-
-       pack_state_finish(&state, out);
-       out->u64s       = f->key_u64s;
-       out->format     = KEY_FORMAT_LOCAL_BTREE;
-       out->type       = KEY_TYPE_deleted;
-
-       if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
-               if (exact) {
-                       BUG_ON(bkey_cmp_left_packed(b, out, &orig));
-               } else {
-                       struct bkey_packed successor;
-
-                       BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
-                       BUG_ON(bkey_packed_successor(&successor, b, *out) &&
-                              bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
-                              !bkey_format_has_too_big_fields(f));
-               }
-       }
-
-       return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
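/*
 * Editorial example of the lossy semantics: if the node's format cannot
 * represent in.snapshot (field_offset[BKEY_FIELD_SNAPSHOT] too large), the
 * position is rounded down to its predecessor with snapshot set to
 * KEY_SNAPSHOT_MAX and BKEY_PACK_POS_SMALLER is returned - still a valid
 * search key, since it compares <= to the original pos.
 */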
-
-void bch2_bkey_format_init(struct bkey_format_state *s)
-{
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
-               s->field_min[i] = U64_MAX;
-
-       for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
-               s->field_max[i] = 0;
-
-       /* Make sure we can store a size of 0: */
-       s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
-       unsigned field = 0;
-
-       __bkey_format_add(s, field++, p.inode);
-       __bkey_format_add(s, field++, p.offset);
-       __bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
-                            unsigned bits, u64 offset)
-{
-       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-       u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-
-       bits = min(bits, unpacked_bits);
-
-       offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
-
-       f->bits_per_field[i]    = bits;
-       f->field_offset[i]      = cpu_to_le64(offset);
-}
-
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
-{
-       unsigned i, bits = KEY_PACKED_BITS_START;
-       struct bkey_format ret = {
-               .nr_fields = BKEY_NR_FIELDS,
-       };
-
-       for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
-               s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
-               set_format_field(&ret, i,
-                                fls64(s->field_max[i] - s->field_min[i]),
-                                s->field_min[i]);
-
-               bits += ret.bits_per_field[i];
-       }
-
-       /* allow for extent merging: */
-       if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
-               unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
-
-               ret.bits_per_field[BKEY_FIELD_SIZE] += b;
-               bits += b;
-       }
-
-       ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
-       /* if we have enough spare bits, round fields up to nearest byte */
-       bits = ret.key_u64s * 64 - bits;
-
-       for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
-               unsigned r = round_up(ret.bits_per_field[i], 8) -
-                       ret.bits_per_field[i];
-
-               if (r <= bits) {
-                       set_format_field(&ret, i,
-                                        ret.bits_per_field[i] + r,
-                                        le64_to_cpu(ret.field_offset[i]));
-                       bits -= r;
-               }
-       }
-
-       if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
-               struct printbuf buf = PRINTBUF;
-
-               BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
-               printbuf_exit(&buf);
-       }
-
-       return ret;
-}
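/*
 * Editorial worked example: if, across the keys added, inode spans
 * [4096, 4100] and offset spans [0, 10^6), the format gets
 * bits_per_field[INODE] = fls64(4) = 3 with field_offset 4096, and
 * bits_per_field[OFFSET] = 20 - later rounded up to whole bytes (8 and 24
 * here) while spare bits in the final u64 allow.
 */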
-
-int bch2_bkey_format_invalid(struct bch_fs *c,
-                            struct bkey_format *f,
-                            enum bch_validate_flags flags,
-                            struct printbuf *err)
-{
-       unsigned bits = KEY_PACKED_BITS_START;
-
-       if (f->nr_fields != BKEY_NR_FIELDS) {
-               prt_printf(err, "incorrect number of fields: got %u, should be %u",
-                          f->nr_fields, BKEY_NR_FIELDS);
-               return -BCH_ERR_invalid;
-       }
-
-       /*
-        * Verify that the packed format can't represent fields larger than the
-        * unpacked format:
-        */
-       for (unsigned i = 0; i < f->nr_fields; i++) {
-               if (bch2_bkey_format_field_overflows(f, i)) {
-                       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-                       u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-                       unsigned packed_bits = min(64, f->bits_per_field[i]);
-                       u64 packed_max = packed_bits
-                               ? ~((~0ULL << 1) << (packed_bits - 1))
-                               : 0;
-
-                       prt_printf(err, "field %u too large: %llu + %llu > %llu",
-                                  i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
-                       return -BCH_ERR_invalid;
-               }
-
-               bits += f->bits_per_field[i];
-       }
-
-       if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
-               prt_printf(err, "incorrect key_u64s: got %u, should be %u",
-                          f->key_u64s, DIV_ROUND_UP(bits, 64));
-               return -BCH_ERR_invalid;
-       }
-
-       return 0;
-}
-
-void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
-{
-       prt_printf(out, "u64s %u fields ", f->key_u64s);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
-               if (i)
-                       prt_str(out, ", ");
-               prt_printf(out, "%u:%llu",
-                          f->bits_per_field[i],
-                          le64_to_cpu(f->field_offset[i]));
-       }
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
-                                         const struct bkey_packed *l_k,
-                                         const struct bkey_packed *r_k)
-{
-       const u64 *l = high_word(&b->format, l_k);
-       const u64 *r = high_word(&b->format, r_k);
-       unsigned nr_key_bits = b->nr_key_bits;
-       unsigned word_bits = 64 - high_bit_offset;
-       u64 l_v, r_v;
-
-       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
-       /* for big endian, skip past header */
-       l_v = *l & (~0ULL >> high_bit_offset);
-       r_v = *r & (~0ULL >> high_bit_offset);
-
-       while (nr_key_bits) {
-               if (nr_key_bits < word_bits) {
-                       l_v >>= word_bits - nr_key_bits;
-                       r_v >>= word_bits - nr_key_bits;
-                       nr_key_bits = 0;
-               } else {
-                       nr_key_bits -= word_bits;
-               }
-
-               if (l_v != r_v)
-                       return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
-               l = next_word(l);
-               r = next_word(r);
-
-               l_v = *l;
-               r_v = *r;
-               word_bits = 64;
-       }
-
-       return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
-{
-       const u64 *p = high_word(&b->format, k);
-       unsigned nr_key_bits = b->nr_key_bits;
-       unsigned ret = 0, offset;
-
-       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
-       offset = nr_key_bits;
-       while (offset > 64) {
-               p = next_word(p);
-               offset -= 64;
-       }
-
-       offset = 64 - offset;
-
-       while (nr_key_bits) {
-               unsigned bits = nr_key_bits + offset < 64
-                       ? nr_key_bits
-                       : 64 - offset;
-
-               u64 mask = (~0ULL >> (64 - bits)) << offset;
-
-               if (*p & mask)
-                       return ret + __ffs64(*p & mask) - offset;
-
-               p = prev_word(p);
-               nr_key_bits -= bits;
-               ret += bits;
-               offset = 0;
-       }
-
-       return 0;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-#define I(_x)                  (*(out)++ = (_x))
-#define I1(i0)                                         I(i0)
-#define I2(i0, i1)             (I1(i0),                I(i1))
-#define I3(i0, i1, i2)         (I2(i0, i1),            I(i2))
-#define I4(i0, i1, i2, i3)     (I3(i0, i1, i2),        I(i3))
-#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3),    I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
-                             enum bch_bkey_fields field,
-                             unsigned dst_offset, unsigned dst_size,
-                             bool *eax_zeroed)
-{
-       unsigned bits = format->bits_per_field[field];
-       u64 offset = le64_to_cpu(format->field_offset[field]);
-       unsigned i, byte, bit_offset, align, shl, shr;
-
-       if (!bits && !offset) {
-               if (!*eax_zeroed) {
-                       /* xor eax, eax */
-                       I2(0x31, 0xc0);
-               }
-
-               *eax_zeroed = true;
-               goto set_field;
-       }
-
-       if (!bits) {
-               /* just return offset: */
-
-               switch (dst_size) {
-               case 8:
-                       if (offset > S32_MAX) {
-                               /* mov [rdi + dst_offset], offset */
-                               I3(0xc7, 0x47, dst_offset);
-                               memcpy(out, &offset, 4);
-                               out += 4;
-
-                               I3(0xc7, 0x47, dst_offset + 4);
-                               memcpy(out, (void *) &offset + 4, 4);
-                               out += 4;
-                       } else {
-                               /* mov [rdi + dst_offset], offset */
-                               /* sign extended */
-                               I4(0x48, 0xc7, 0x47, dst_offset);
-                               memcpy(out, &offset, 4);
-                               out += 4;
-                       }
-                       break;
-               case 4:
-                       /* mov [rdi + dst_offset], offset */
-                       I3(0xc7, 0x47, dst_offset);
-                       memcpy(out, &offset, 4);
-                       out += 4;
-                       break;
-               default:
-                       BUG();
-               }
-
-               return out;
-       }
-
-       bit_offset = format->key_u64s * 64;
-       for (i = 0; i <= field; i++)
-               bit_offset -= format->bits_per_field[i];
-
-       byte = bit_offset / 8;
-       bit_offset -= byte * 8;
-
-       *eax_zeroed = false;
-
-       if (bit_offset == 0 && bits == 8) {
-               /* movzx eax, BYTE PTR [rsi + imm8] */
-               I4(0x0f, 0xb6, 0x46, byte);
-       } else if (bit_offset == 0 && bits == 16) {
-               /* movzx eax, WORD PTR [rsi + imm8] */
-               I4(0x0f, 0xb7, 0x46, byte);
-       } else if (bit_offset + bits <= 32) {
-               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
-               byte -= align;
-               bit_offset += align * 8;
-
-               BUG_ON(bit_offset + bits > 32);
-
-               /* mov eax, [rsi + imm8] */
-               I3(0x8b, 0x46, byte);
-
-               if (bit_offset) {
-                       /* shr eax, imm8 */
-                       I3(0xc1, 0xe8, bit_offset);
-               }
-
-               if (bit_offset + bits < 32) {
-                       unsigned mask = ~0U >> (32 - bits);
-
-                       /* and eax, imm32 */
-                       I1(0x25);
-                       memcpy(out, &mask, 4);
-                       out += 4;
-               }
-       } else if (bit_offset + bits <= 64) {
-               align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
-               byte -= align;
-               bit_offset += align * 8;
-
-               BUG_ON(bit_offset + bits > 64);
-
-               /* mov rax, [rsi + imm8] */
-               I4(0x48, 0x8b, 0x46, byte);
-
-               shl = 64 - bit_offset - bits;
-               shr = bit_offset + shl;
-
-               if (shl) {
-                       /* shl rax, imm8 */
-                       I4(0x48, 0xc1, 0xe0, shl);
-               }
-
-               if (shr) {
-                       /* shr rax, imm8 */
-                       I4(0x48, 0xc1, 0xe8, shr);
-               }
-       } else {
-               align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
-               byte -= align;
-               bit_offset += align * 8;
-
-               BUG_ON(bit_offset + bits > 96);
-
-               /* mov rax, [rsi + byte] */
-               I4(0x48, 0x8b, 0x46, byte);
-
-               /* mov edx, [rsi + byte + 8] */
-               I3(0x8b, 0x56, byte + 8);
-
-               /* bits from next word: */
-               shr = bit_offset + bits - 64;
-               BUG_ON(shr > bit_offset);
-
-               /* shr rax, bit_offset */
-               I4(0x48, 0xc1, 0xe8, shr);
-
-               /* shl rdx, imm8 */
-               I4(0x48, 0xc1, 0xe2, 64 - shr);
-
-               /* or rax, rdx */
-               I3(0x48, 0x09, 0xd0);
-
-               shr = bit_offset - shr;
-
-               if (shr) {
-                       /* shr rax, imm8 */
-                       I4(0x48, 0xc1, 0xe8, shr);
-               }
-       }
-
-       /* rax += offset: */
-       if (offset > S32_MAX) {
-               /* mov rdx, imm64 */
-               I2(0x48, 0xba);
-               memcpy(out, &offset, 8);
-               out += 8;
-               /* add %rdx, %rax */
-               I3(0x48, 0x01, 0xd0);
-       } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
-               /* add rax, imm32 */
-               I2(0x48, 0x05);
-               memcpy(out, &offset, 4);
-               out += 4;
-       } else if (offset) {
-               /* add eax, imm32 */
-               I1(0x05);
-               memcpy(out, &offset, 4);
-               out += 4;
-       }
-set_field:
-       switch (dst_size) {
-       case 8:
-               /* mov [rdi + dst_offset], rax */
-               I4(0x48, 0x89, 0x47, dst_offset);
-               break;
-       case 4:
-               /* mov [rdi + dst_offset], eax */
-               I3(0x89, 0x47, dst_offset);
-               break;
-       default:
-               BUG();
-       }
-
-       return out;
-}
-
-int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
-       bool eax_zeroed = false;
-       u8 *out = _out;
-
-       /*
-        * rdi: dst - unpacked key
-        * rsi: src - packed key
-        */
-
-       /* k->u64s, k->format, k->type */
-
-       /* mov eax, [rsi] */
-       I2(0x8b, 0x06);
-
-       /* add eax, BKEY_U64s - format->key_u64s */
-       I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
-       /* and eax, imm32: mask out k->pad: */
-       I5(0x25, 0xff, 0xff, 0xff, 0);
-
-       /* mov [rdi], eax */
-       I2(0x89, 0x07);
-
-#define x(id, field)                                                   \
-       out = compile_bkey_field(format, out, id,                       \
-                                offsetof(struct bkey, field),          \
-                                sizeof(((struct bkey *) NULL)->field), \
-                                &eax_zeroed);
-       bkey_fields()
-#undef x
-
-       /* retq */
-       I1(0xc3);
-
-       return (void *) out - _out;
-}
-
-#else
-#endif
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
-                                         const struct bkey_packed *r,
-                                         const struct btree *b)
-{
-       return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
-                                              const struct bkey_packed *l,
-                                              const struct bpos *r)
-{
-       return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int bch2_bkey_cmp_packed(const struct btree *b,
-                        const struct bkey_packed *l,
-                        const struct bkey_packed *r)
-{
-       return bch2_bkey_cmp_packed_inlined(b, l, r);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed(const struct btree *b,
-                               const struct bkey_packed *l,
-                               const struct bpos *r)
-{
-       const struct bkey *l_unpacked;
-
-       return unlikely(l_unpacked = packed_to_bkey_c(l))
-               ? bpos_cmp(l_unpacked->p, *r)
-               : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch2_bpos_swab(struct bpos *p)
-{
-       u8 *l = (u8 *) p;
-       u8 *h = ((u8 *) &p[1]) - 1;
-
-       while (l < h) {
-               swap(*l, *h);
-               l++;
-               --h;
-       }
-}
-
-void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
-       const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
-       u8 *l = k->key_start;
-       u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
-
-       while (l < h) {
-               swap(*l, *h);
-               l++;
-               --h;
-       }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void)
-{
-       struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
-       struct bkey_packed p;
-
-       struct bkey_format test_format = {
-               .key_u64s       = 3,
-               .nr_fields      = BKEY_NR_FIELDS,
-               .bits_per_field = {
-                       13,
-                       64,
-                       32,
-               },
-       };
-
-       struct unpack_state in_s =
-               unpack_state_init(&bch2_bkey_format_current, (void *) &t);
-       struct pack_state out_s = pack_state_init(&test_format, &p);
-       unsigned i;
-
-       for (i = 0; i < out_s.format->nr_fields; i++) {
-               u64 a, v = get_inc_field(&in_s, i);
-
-               switch (i) {
-#define x(id, field)   case id: a = t.field; break;
-       bkey_fields()
-#undef x
-               default:
-                       BUG();
-               }
-
-               if (a != v)
-                       panic("got %llu actual %llu i %u\n", v, a, i);
-
-               if (!set_inc_field(&out_s, i, v))
-                       panic("failed at %u\n", i);
-       }
-
-       BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
-}
-#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
deleted file mode 100644
index 3ccd521..0000000
--- a/fs/bcachefs/bkey.h
+++ /dev/null
@@ -1,605 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_H
-#define _BCACHEFS_BKEY_H
-
-#include <linux/bug.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-#include "btree_types.h"
-#include "util.h"
-#include "vstructs.h"
-
-#if 0
-
-/*
- * compiled unpack functions are disabled, pending a new interface for
- * dynamically allocating executable memory:
- */
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK  1
-#endif
-#endif
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *,
-                                    const struct bkey_format *,
-                                    const struct bkey_packed *);
-
-enum bkey_lr_packed {
-       BKEY_PACKED_BOTH,
-       BKEY_PACKED_RIGHT,
-       BKEY_PACKED_LEFT,
-       BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed(_l, _r)                                         \
-       ((_l)->format + ((_r)->format << 1))
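/*
 * Editorial note: this arithmetic relies on KEY_FORMAT_LOCAL_BTREE == 0 and
 * KEY_FORMAT_CURRENT == 1, so l->format + (r->format << 1) indexes straight
 * into the enum above: both packed -> 0 (BKEY_PACKED_BOTH), l unpacked and
 * r packed -> 1 (BKEY_PACKED_RIGHT), both unpacked -> 3 (BKEY_PACKED_NONE).
 */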
-
-static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
-{
-       memcpy_u64s_small(dst, src, src->u64s);
-}
-
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
-       memcpy_u64s_small(dst, src, src->k.u64s);
-}
-
-struct btree;
-
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
-                                         const struct bkey_packed *,
-                                         const struct bkey_packed *);
-__pure
-unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
-                                    const struct bkey_packed *,
-                                    const struct btree *);
-
-__pure
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
-                                         const struct bkey_packed *,
-                                         const struct bpos *);
-
-__pure
-int bch2_bkey_cmp_packed(const struct btree *,
-                        const struct bkey_packed *,
-                        const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_left_packed(const struct btree *,
-                               const struct bkey_packed *,
-                               const struct bpos *);
-
-static inline __pure
-int bkey_cmp_left_packed(const struct btree *b,
-                        const struct bkey_packed *l, const struct bpos *r)
-{
-       return __bch2_bkey_cmp_left_packed(b, l, r);
-}
-
-/*
- * The compiler generates better code when we pass bpos by ref, but it's often
- * enough terribly convenient to pass it by val... as much as I hate c++, const
- * ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
-                                            const struct bkey_packed *l,
-                                            struct bpos r)
-{
-       return bkey_cmp_left_packed(b, l, &r);
-}
-
-static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
-{
-       return  !((l.inode      ^ r.inode) |
-                 (l.offset     ^ r.offset) |
-                 (l.snapshot   ^ r.snapshot));
-}
-
-static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
-{
-       return  l.inode != r.inode ? l.inode < r.inode :
-               l.offset != r.offset ? l.offset < r.offset :
-               l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
-}
-
-static __always_inline bool bpos_le(struct bpos l, struct bpos r)
-{
-       return  l.inode != r.inode ? l.inode < r.inode :
-               l.offset != r.offset ? l.offset < r.offset :
-               l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
-}
-
-static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
-{
-       return bpos_lt(r, l);
-}
-
-static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
-{
-       return bpos_le(r, l);
-}
-
-static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
-{
-       return  cmp_int(l.inode,    r.inode) ?:
-               cmp_int(l.offset,   r.offset) ?:
-               cmp_int(l.snapshot, r.snapshot);
-}
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
-       return bpos_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
-{
-       return bpos_gt(l, r) ? l : r;
-}
-
-static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
-{
-       return  !((l.inode      ^ r.inode) |
-                 (l.offset     ^ r.offset));
-}
-
-static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
-{
-       return  l.inode != r.inode
-               ? l.inode < r.inode
-               : l.offset < r.offset;
-}
-
-static __always_inline bool bkey_le(struct bpos l, struct bpos r)
-{
-       return  l.inode != r.inode
-               ? l.inode < r.inode
-               : l.offset <= r.offset;
-}
-
-static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
-{
-       return bkey_lt(r, l);
-}
-
-static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
-{
-       return bkey_le(r, l);
-}
-
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
-       return  cmp_int(l.inode,    r.inode) ?:
-               cmp_int(l.offset,   r.offset);
-}
-
-static inline struct bpos bkey_min(struct bpos l, struct bpos r)
-{
-       return bkey_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bkey_max(struct bpos l, struct bpos r)
-{
-       return bkey_gt(l, r) ? l : r;
-}
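/*
 * Editorial note on the two families above: bpos_* compare all three
 * fields while bkey_* ignore snapshot.  Two positions differing only in
 * .snapshot are bkey_eq() but not bpos_eq() - iteration that sees all
 * snapshot versions of a key depends on exactly this distinction.
 */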
-
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
-       return bpos_eq(l.k->p, r.k->p) &&
-               l.k->size == r.k->size &&
-               bkey_bytes(l.k) == bkey_bytes(r.k) &&
-               !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
-void bch2_bpos_swab(struct bpos *);
-void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
-       return  cmp_int(l.hi, r.hi) ?:
-               cmp_int(l.lo, r.lo);
-}
-
-#define ZERO_VERSION   ((struct bversion) { .hi = 0, .lo = 0 })
-#define MAX_VERSION    ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-
-static __always_inline bool bversion_zero(struct bversion v)
-{
-       return bversion_cmp(v, ZERO_VERSION) == 0;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k)                                                        \
-       ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT);                  \
-        (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k)                ((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
-       return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
-       return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
-       return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
-       return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
-       return format->bits_per_field[BKEY_FIELD_INODE] +
-               format->bits_per_field[BKEY_FIELD_OFFSET] +
-               format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bpos_successor(struct bpos p)
-{
-       if (!++p.snapshot &&
-           !++p.offset &&
-           !++p.inode)
-               BUG();
-
-       return p;
-}
-
-static inline struct bpos bpos_predecessor(struct bpos p)
-{
-       if (!p.snapshot-- &&
-           !p.offset-- &&
-           !p.inode--)
-               BUG();
-
-       return p;
-}
-
-static inline struct bpos bpos_nosnap_successor(struct bpos p)
-{
-       p.snapshot = 0;
-
-       if (!++p.offset &&
-           !++p.inode)
-               BUG();
-
-       return p;
-}
-
-static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
-{
-       p.snapshot = 0;
-
-       if (!p.offset-- &&
-           !p.inode--)
-               BUG();
-
-       return p;
-}
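/*
 * Editorial example: bpos_successor() carries like a multiword increment,
 * snapshot first - {inode 1, offset 2, snapshot 3} -> {1, 2, 4}, and
 * {1, 2, U32_MAX} -> {1, 3, 0}; the nosnap variants zero the snapshot and
 * step offset directly.
 */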
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
-       return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
-       return (struct bpos) {
-               .inode          = k->p.inode,
-               .offset         = bkey_start_offset(k),
-               .snapshot       = k->p.snapshot,
-       };
-}
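/*
 * Editorial example: for extents, p.offset is the *end* of the extent.  A
 * key with p.offset == 24 and size == 8 covers [16, 24), so
 * bkey_start_offset() returns 16 and bkey_start_pos() the same position
 * with offset 16.
 */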
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
-                                     const struct bkey_packed *k)
-{
-       return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-}
-
-static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
-                                   const struct bkey_packed *k)
-{
-       return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
-                                      const struct bkey_packed *k)
-{
-       return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
-                                     const struct bkey_packed *k)
-{
-       return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
-                                    const struct bkey_packed *k)
-{
-       return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
-                                     struct bkey_packed *k, unsigned val_u64s)
-{
-       k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k)                                         \
-        ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch2_bkey_format_current;
-
-bool bch2_bkey_transform(const struct bkey_format *,
-                        struct bkey_packed *,
-                        const struct bkey_format *,
-                        const struct bkey_packed *);
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
-                                  const struct bkey_packed *);
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
-                             const struct bkey_packed *);
-#endif
-
-bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
-                  const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
-       BKEY_PACK_POS_EXACT,
-       BKEY_PACK_POS_SMALLER,
-       BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
-                                          const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
-                                const struct btree *b)
-{
-       return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
-                const struct bkey_packed *);
-bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
-              const struct bkey_format *);
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
-                              struct bkey *dst,
-                              const struct bkey_packed *src)
-{
-       if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
-               compiled_unpack_fn unpack_fn = b->aux_data;
-               unpack_fn(dst, src);
-
-               if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
-                       struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
-                       BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
-               }
-       } else {
-               *dst = __bch2_bkey_unpack_key(&b->format, src);
-       }
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
-                              const struct bkey_packed *src)
-{
-       struct bkey dst;
-
-       __bkey_unpack_key_format_checked(b, &dst, src);
-       return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
-                                    struct bkey *dst,
-                                    const struct bkey_packed *src)
-{
-       if (likely(bkey_packed(src)))
-               __bkey_unpack_key_format_checked(b, dst, src);
-       else
-               *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
-                                         const struct bkey_packed *src)
-{
-       return likely(bkey_packed(src))
-               ? bkey_unpack_key_format_checked(b, src)
-               : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
-                              const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-       return bkey_unpack_key_format_checked(b, src).p;
-#else
-       return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
-                                         const struct bkey_packed *src)
-{
-       return likely(bkey_packed(src))
-               ? bkey_unpack_pos_format_checked(b, src)
-               : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
-                                              const struct bkey_packed *k,
-                                              struct bkey *u)
-{
-       __bkey_unpack_key(b, u, k);
-
-       return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(const struct btree *b,
-                                              struct bkey_packed *k,
-                                              struct bkey *u)
-{
-       __bkey_unpack_key(b, u, k);
-
-       return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
-                                enum bch_bkey_fields nr)
-{
-       return f->bits_per_field[nr] < 64
-               ? (le64_to_cpu(f->field_offset[nr]) +
-                  ~(~0ULL << f->bits_per_field[nr]))
-               : U64_MAX;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-int bch2_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch2_compile_bkey_format(const struct bkey_format *format,
-                                         void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
-                                  struct bkey_s_c src)
-{
-       dst->k = *src.k;
-       memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-/* byte order helpers */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
-       return f->key_u64s - 1;
-}
-
-#define high_bit_offset                0
-#define nth_word(p, n)         ((p) - (n))
-
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
-       return 0;
-}
-
-#define high_bit_offset                KEY_PACKED_BITS_START
-#define nth_word(p, n)         ((p) + (n))
-
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define high_word(f, k)                ((u64 *) (k)->_data + high_word_offset(f))
-#define next_word(p)           nth_word(p, 1)
-#define prev_word(p)           nth_word(p, -1)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void);
-#else
-static inline void bch2_bkey_pack_test(void) {}
-#endif
-
-#define bkey_fields()                                                  \
-       x(BKEY_FIELD_INODE,             p.inode)                        \
-       x(BKEY_FIELD_OFFSET,            p.offset)                       \
-       x(BKEY_FIELD_SNAPSHOT,          p.snapshot)                     \
-       x(BKEY_FIELD_SIZE,              size)                           \
-       x(BKEY_FIELD_VERSION_HI,        bversion.hi)                    \
-       x(BKEY_FIELD_VERSION_LO,        bversion.lo)
-
-struct bkey_format_state {
-       u64 field_min[BKEY_NR_FIELDS];
-       u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-
-static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
-{
-       s->field_min[field] = min(s->field_min[field], v);
-       s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Expands the ranges tracked in @s so that @k can be successfully packed by
- * the format later built from @s by bch2_bkey_format_done()
- */
-static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
-       bkey_fields()
-#undef x
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-
-static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
-{
-       unsigned f_bits = f->bits_per_field[i];
-       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-       u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
-       u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-       if (f_bits > unpacked_bits)
-               return true;
-
-       if ((f_bits == unpacked_bits) && field_offset)
-               return true;
-
-       u64 f_mask = f_bits
-               ? ~((~0ULL << (f_bits - 1)) << 1)
-               : 0;
-
-       if (((field_offset + f_mask) & unpacked_mask) < field_offset)
-               return true;
-       return false;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
-                            enum bch_validate_flags, struct printbuf *);
-void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
-
-#endif /* _BCACHEFS_BKEY_H */
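
bkey_field_max() and bch2_bkey_format_field_overflows() above both rest on the
same packed-field arithmetic: a field stored in bits_per_field[i] bits above a
base of field_offset[i] can represent offset .. offset + 2^bits - 1. A minimal
standalone C sketch of that range computation (hypothetical names, not
bcachefs code):

#include <stdint.h>
#include <stdio.h>

/* Largest value representable by a field of `bits` bits based at `offset`. */
static uint64_t field_max(unsigned bits, uint64_t offset)
{
	/* ~(~0ULL << bits) == 2^bits - 1, written to avoid shifting by 64 */
	return bits < 64 ? offset + ~(~0ULL << bits) : UINT64_MAX;
}

int main(void)
{
	/* a 20-bit field based at 1000 covers 1000 .. 1049575 */
	printf("%llu\n", (unsigned long long) field_max(20, 1000));
	return 0;
}
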
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
deleted file mode 100644 (file)
index a30c4ae..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_BUF_H
-#define _BCACHEFS_BKEY_BUF_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-
-struct bkey_buf {
-       struct bkey_i   *k;
-       u64             onstack[12];
-};
-
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
-                                        struct bch_fs *c, unsigned u64s)
-{
-       if (s->k == (void *) s->onstack &&
-           u64s > ARRAY_SIZE(s->onstack)) {
-               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
-               memcpy(s->k, s->onstack, sizeof(s->onstack));
-       }
-}
-
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
-                                           struct bch_fs *c,
-                                           struct bkey_s_c k)
-{
-       bch2_bkey_buf_realloc(s, c, k.k->u64s);
-       bkey_reassemble(s->k, k);
-}
-
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
-                                     struct bch_fs *c,
-                                     struct bkey_i *src)
-{
-       bch2_bkey_buf_realloc(s, c, src->k.u64s);
-       bkey_copy(s->k, src);
-}
-
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
-                                       struct bch_fs *c,
-                                       struct btree *b,
-                                       struct bkey_packed *src)
-{
-       bch2_bkey_buf_realloc(s, c, BKEY_U64s +
-                             bkeyp_val_u64s(&b->format, src));
-       bch2_bkey_unpack(b, s->k, src);
-}
-
-static inline void bch2_bkey_buf_init(struct bkey_buf *s)
-{
-       s->k = (void *) s->onstack;
-}
-
-static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
-{
-       if (s->k != (void *) s->onstack)
-               mempool_free(s->k, &c->large_bkey_pool);
-       s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_BUF_H */
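
bkey_buf is the usual small-buffer pattern: a key starts in the on-stack
storage and spills to a mempool only when it outgrows it; mempool_alloc()
takes no size here because large_bkey_pool is sized for the largest possible
key. A standalone sketch of the same pattern, with malloc standing in for the
mempool (hypothetical names, not bcachefs code):

#include <stdlib.h>
#include <string.h>

struct small_buf {
	void	*p;
	char	onstack[96];
};

static void small_buf_init(struct small_buf *b)
{
	b->p = b->onstack;
}

static void small_buf_realloc(struct small_buf *b, size_t bytes)
{
	/* spill to the heap once, preserving whatever is on the stack;
	 * unlike malloc, a mempool allocation can't fail, so error
	 * handling is elided here */
	if (b->p == b->onstack && bytes > sizeof(b->onstack)) {
		b->p = malloc(bytes);
		memcpy(b->p, b->onstack, sizeof(b->onstack));
	}
}

static void small_buf_exit(struct small_buf *b)
{
	if (b->p != b->onstack)
		free(b->p);
	b->p = NULL;
}

int main(void)
{
	struct small_buf b;

	small_buf_init(&b);
	small_buf_realloc(&b, 4096);	/* outgrows onstack: spills */
	small_buf_exit(&b);
	return 0;
}
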
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
deleted file mode 100644 (file)
index 5f42a6e..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_CMP_H
-#define _BCACHEFS_BKEY_CMP_H
-
-#include "bkey.h"
-
-#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-                                 unsigned nr_key_bits)
-{
-       long d0, d1, d2, d3;
-       int cmp;
-
-       /* we shouldn't need asm for this, but gcc generates poor code here: */
-
-       asm(".intel_syntax noprefix;"
-           "xor eax, eax;"
-           "xor edx, edx;"
-           "1:;"
-           "mov r8, [rdi];"
-           "mov r9, [rsi];"
-           "sub ecx, 64;"
-           "jl 2f;"
-
-           "cmp r8, r9;"
-           "jnz 3f;"
-
-           "lea rdi, [rdi - 8];"
-           "lea rsi, [rsi - 8];"
-           "jmp 1b;"
-
-           "2:;"
-           "not ecx;"
-           "shr r8, 1;"
-           "shr r9, 1;"
-           "shr r8, cl;"
-           "shr r9, cl;"
-           "cmp r8, r9;"
-
-           "3:\n"
-           "seta al;"
-           "setb dl;"
-           "sub eax, edx;"
-           ".att_syntax prefix;"
-           : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
-           : "0" (l), "1" (r), "3" (nr_key_bits)
-           : "r8", "r9", "cc", "memory");
-
-       return cmp;
-}
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
-                                 unsigned nr_key_bits)
-{
-       u64 l_v, r_v;
-
-       if (!nr_key_bits)
-               return 0;
-
-       /* for big endian, skip past header */
-       nr_key_bits += high_bit_offset;
-       l_v = *l & (~0ULL >> high_bit_offset);
-       r_v = *r & (~0ULL >> high_bit_offset);
-
-       while (1) {
-               if (nr_key_bits < 64) {
-                       l_v >>= 64 - nr_key_bits;
-                       r_v >>= 64 - nr_key_bits;
-                       nr_key_bits = 0;
-               } else {
-                       nr_key_bits -= 64;
-               }
-
-               if (!nr_key_bits || l_v != r_v)
-                       break;
-
-               l = next_word(l);
-               r = next_word(r);
-
-               l_v = *l;
-               r_v = *r;
-       }
-
-       return cmp_int(l_v, r_v);
-}
-#endif
-
-static inline __pure __flatten
-int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
-                                         const struct bkey_packed *r,
-                                         const struct btree *b)
-{
-       const struct bkey_format *f = &b->format;
-       int ret;
-
-       EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
-       EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
-       ret = __bkey_cmp_bits(high_word(f, l),
-                             high_word(f, r),
-                             b->nr_key_bits);
-
-       EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
-                               bkey_unpack_pos(b, r)));
-       return ret;
-}
-
-static inline __pure __flatten
-int bch2_bkey_cmp_packed_inlined(const struct btree *b,
-                        const struct bkey_packed *l,
-                        const struct bkey_packed *r)
-{
-       struct bkey unpacked;
-
-       if (likely(bkey_packed(l) && bkey_packed(r)))
-               return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-
-       if (bkey_packed(l)) {
-               __bkey_unpack_key_format_checked(b, &unpacked, l);
-               l = (void *) &unpacked;
-       } else if (bkey_packed(r)) {
-               __bkey_unpack_key_format_checked(b, &unpacked, r);
-               r = (void *) &unpacked;
-       }
-
-       return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
-}
-
-#endif /* _BCACHEFS_BKEY_CMP_H */
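
Both branches of __bkey_cmp_bits() implement the same loop: compare the packed
keys one 64-bit word at a time starting from the most-significant word, and on
the final partial word shift away the low bits that aren't part of the key. A
plain-C rendering of that intent for the little-endian layout (hypothetical
names, for illustration only):

#include <stdint.h>

/* Compare the top nr_bits of two word arrays, most-significant word first. */
static int cmp_high_bits(const uint64_t *l, const uint64_t *r, unsigned nr_bits)
{
	while (nr_bits >= 64) {
		if (*l != *r)
			return *l < *r ? -1 : 1;
		nr_bits -= 64;
		if (!nr_bits)
			return 0;
		l--;			/* little endian: next word is lower */
		r--;
	}

	if (nr_bits) {
		/* keep only the top nr_bits of the last word */
		uint64_t lv = *l >> (64 - nr_bits);
		uint64_t rv = *r >> (64 - nr_bits);

		if (lv != rv)
			return lv < rv ? -1 : 1;
	}
	return 0;
}
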
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
deleted file mode 100644 (file)
index fcd8c82..0000000
+++ /dev/null
@@ -1,497 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_types.h"
-#include "alloc_background.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "lru.h"
-#include "quota.h"
-#include "reflink.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-const char * const bch2_bkey_types[] = {
-#define x(name, nr, ...) #name,
-       BCH_BKEY_TYPES()
-#undef x
-       NULL
-};
-
-static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
-                               struct bkey_validate_context from)
-{
-       return 0;
-}
-
-#define bch2_bkey_ops_deleted ((struct bkey_ops) {     \
-       .key_validate   = deleted_key_validate,         \
-})
-
-#define bch2_bkey_ops_whiteout ((struct bkey_ops) {    \
-       .key_validate   = deleted_key_validate,         \
-})
-
-static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
-                                 struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_bytes(k.k),
-                        c, bkey_val_size_nonzero,
-                        "incorrect value size (%zu != 0)",
-                        bkey_val_bytes(k.k));
-fsck_err:
-       return ret;
-}
-
-#define bch2_bkey_ops_error ((struct bkey_ops) {       \
-       .key_validate = empty_val_key_validate,         \
-})
-
-static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
-                                   struct bkey_validate_context from)
-{
-       return 0;
-}
-
-static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
-                                   struct bkey_s_c k)
-{
-       struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
-
-       prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
-}
-
-#define bch2_bkey_ops_cookie ((struct bkey_ops) {      \
-       .key_validate   = key_type_cookie_validate,     \
-       .val_to_text    = key_type_cookie_to_text,      \
-       .min_val_size   = 8,                            \
-})
-
-#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
-       .key_validate   = empty_val_key_validate,       \
-})
-
-static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
-                                        struct bkey_validate_context from)
-{
-       return 0;
-}
-
-static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
-                                        struct bkey_s_c k)
-{
-       struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
-       unsigned datalen = bkey_inline_data_bytes(k.k);
-
-       prt_printf(out, "datalen %u: %*phN",
-              datalen, min(datalen, 32U), d.v->data);
-}
-
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) {         \
-       .key_validate   = key_type_inline_data_validate,        \
-       .val_to_text    = key_type_inline_data_to_text,         \
-})
-
-static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-       bch2_key_resize(l.k, l.k->size + r.k->size);
-       return true;
-}
-
-#define bch2_bkey_ops_set ((struct bkey_ops) {         \
-       .key_validate   = empty_val_key_validate,       \
-       .key_merge      = key_type_set_merge,           \
-})
-
-const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr, ...) [KEY_TYPE_##name]     = bch2_bkey_ops_##name,
-       BCH_BKEY_TYPES()
-#undef x
-};
-
-const struct bkey_ops bch2_bkey_null_ops = {
-};
-
-int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
-               return 0;
-
-       const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
-                        c, bkey_val_size_too_small,
-                        "bad val size (%zu < %u)",
-                        bkey_val_bytes(k.k), ops->min_val_size);
-
-       if (!ops->key_validate)
-               return 0;
-
-       ret = ops->key_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-static u64 bch2_key_types_allowed[] = {
-       [BKEY_TYPE_btree] =
-               BIT_ULL(KEY_TYPE_deleted)|
-               BIT_ULL(KEY_TYPE_btree_ptr)|
-               BIT_ULL(KEY_TYPE_btree_ptr_v2),
-#define x(name, nr, flags, keys)       [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
-       BCH_BTREE_IDS()
-#undef x
-};
-
-static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
-#define x(name, nr, flags)     [KEY_TYPE_##name] = flags,
-       BCH_BKEY_TYPES()
-#undef x
-};
-
-const char *bch2_btree_node_type_str(enum btree_node_type type)
-{
-       return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
-}
-
-int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
-                        struct bkey_validate_context from)
-{
-       enum btree_node_type type = __btree_node_type(from.level, from.btree);
-
-       if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
-               return 0;
-
-       int ret = 0;
-
-       bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
-                        c, bkey_u64s_too_small,
-                        "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-
-       if (type >= BKEY_TYPE_NR)
-               return 0;
-
-       enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
-               ? bch2_bkey_type_flags[k.k->type]
-               : 0;
-
-       bool strict_key_type_allowed =
-               (from.flags & BCH_VALIDATE_commit) ||
-               type == BKEY_TYPE_btree ||
-               (from.btree < BTREE_ID_NR &&
-                (bkey_flags & BKEY_TYPE_strict_btree_checks));
-
-       bkey_fsck_err_on(strict_key_type_allowed &&
-                        k.k->type < KEY_TYPE_MAX &&
-                        !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
-                        c, bkey_invalid_type_for_btree,
-                        "invalid key type for btree %s (%s)",
-                        bch2_btree_node_type_str(type),
-                        k.k->type < KEY_TYPE_MAX
-                        ? bch2_bkey_types[k.k->type]
-                        : "(unknown)");
-
-       if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
-               bkey_fsck_err_on(k.k->size == 0,
-                                c, bkey_extent_size_zero,
-                                "size == 0");
-
-               bkey_fsck_err_on(k.k->size > k.k->p.offset,
-                                c, bkey_extent_size_greater_than_offset,
-                                "size greater than offset (%u > %llu)",
-                                k.k->size, k.k->p.offset);
-       } else {
-               bkey_fsck_err_on(k.k->size,
-                                c, bkey_size_nonzero,
-                                "size != 0");
-       }
-
-       if (type != BKEY_TYPE_btree) {
-               enum btree_id btree = type - 1;
-
-               if (btree_type_has_snapshots(btree)) {
-                       bkey_fsck_err_on(!k.k->p.snapshot,
-                                        c, bkey_snapshot_zero,
-                                        "snapshot == 0");
-               } else if (!btree_type_has_snapshot_field(btree)) {
-                       bkey_fsck_err_on(k.k->p.snapshot,
-                                        c, bkey_snapshot_nonzero,
-                                        "nonzero snapshot");
-               } else {
-                       /*
-                        * btree uses snapshot field but it's not required to be
-                        * nonzero
-                        */
-               }
-
-               bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
-                                c, bkey_at_pos_max,
-                                "key at POS_MAX");
-       }
-fsck_err:
-       return ret;
-}
-
-int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
-                      struct bkey_validate_context from)
-{
-       return __bch2_bkey_validate(c, k, from) ?:
-               bch2_bkey_val_validate(c, k, from);
-}
-
-int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
-                           struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
-                        c, bkey_before_start_of_btree_node,
-                        "key before start of btree node");
-
-       bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
-                        c, bkey_after_end_of_btree_node,
-                        "key past end of btree node");
-fsck_err:
-       return ret;
-}
-
-void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
-{
-       if (bpos_eq(pos, POS_MIN))
-               prt_printf(out, "POS_MIN");
-       else if (bpos_eq(pos, POS_MAX))
-               prt_printf(out, "POS_MAX");
-       else if (bpos_eq(pos, SPOS_MAX))
-               prt_printf(out, "SPOS_MAX");
-       else {
-               if (pos.inode == U64_MAX)
-                       prt_printf(out, "U64_MAX");
-               else
-                       prt_printf(out, "%llu", pos.inode);
-               prt_printf(out, ":");
-               if (pos.offset == U64_MAX)
-                       prt_printf(out, "U64_MAX");
-               else
-                       prt_printf(out, "%llu", pos.offset);
-               prt_printf(out, ":");
-               if (pos.snapshot == U32_MAX)
-                       prt_printf(out, "U32_MAX");
-               else
-                       prt_printf(out, "%u", pos.snapshot);
-       }
-}
-
-void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
-{
-       if (k) {
-               prt_printf(out, "u64s %u type ", k->u64s);
-
-               if (k->type < KEY_TYPE_MAX)
-                       prt_printf(out, "%s ", bch2_bkey_types[k->type]);
-               else
-                       prt_printf(out, "%u ", k->type);
-
-               bch2_bpos_to_text(out, k->p);
-
-               prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
-       } else {
-               prt_printf(out, "(null)");
-       }
-}
-
-void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
-                     struct bkey_s_c k)
-{
-       const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-       if (likely(ops->val_to_text))
-               ops->val_to_text(out, c, k);
-}
-
-void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
-                          struct bkey_s_c k)
-{
-       bch2_bkey_to_text(out, k.k);
-
-       if (bkey_val_bytes(k.k)) {
-               prt_printf(out, ": ");
-               bch2_val_to_text(out, c, k);
-       }
-}
-
-void bch2_bkey_swab_val(struct bkey_s k)
-{
-       const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-       if (ops->swab)
-               ops->swab(k);
-}
-
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
-       const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-       return ops->key_normalize
-               ? ops->key_normalize(c, k)
-               : false;
-}
-
-bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-       const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
-
-       return ops->key_merge &&
-               bch2_bkey_maybe_mergable(l.k, r.k) &&
-               (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
-               !static_branch_unlikely(&bch2_key_merging_disabled) &&
-               ops->key_merge(c, l, r);
-}
-
-static const struct old_bkey_type {
-       u8              btree_node_type;
-       u8              old;
-       u8              new;
-} bkey_renumber_table[] = {
-       {BKEY_TYPE_btree,       128, KEY_TYPE_btree_ptr         },
-       {BKEY_TYPE_extents,     128, KEY_TYPE_extent            },
-       {BKEY_TYPE_extents,     129, KEY_TYPE_extent            },
-       {BKEY_TYPE_extents,     130, KEY_TYPE_reservation       },
-       {BKEY_TYPE_inodes,      128, KEY_TYPE_inode             },
-       {BKEY_TYPE_inodes,      130, KEY_TYPE_inode_generation  },
-       {BKEY_TYPE_dirents,     128, KEY_TYPE_dirent            },
-       {BKEY_TYPE_dirents,     129, KEY_TYPE_hash_whiteout     },
-       {BKEY_TYPE_xattrs,      128, KEY_TYPE_xattr             },
-       {BKEY_TYPE_xattrs,      129, KEY_TYPE_hash_whiteout     },
-       {BKEY_TYPE_alloc,       128, KEY_TYPE_alloc             },
-       {BKEY_TYPE_quotas,      128, KEY_TYPE_quota             },
-};
-
-void bch2_bkey_renumber(enum btree_node_type btree_node_type,
-                       struct bkey_packed *k,
-                       int write)
-{
-       const struct old_bkey_type *i;
-
-       for (i = bkey_renumber_table;
-            i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
-            i++)
-               if (btree_node_type == i->btree_node_type &&
-                   k->type == (write ? i->new : i->old)) {
-                       k->type = write ? i->old : i->new;
-                       break;
-               }
-}
-
-void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
-                       unsigned version, unsigned big_endian,
-                       int write,
-                       struct bkey_format *f,
-                       struct bkey_packed *k)
-{
-       const struct bkey_ops *ops;
-       struct bkey uk;
-       unsigned nr_compat = 5;
-       int i;
-
-       /*
-        * Do these operations in reverse order in the write path:
-        */
-
-       for (i = 0; i < nr_compat; i++)
-       switch (!write ? i : nr_compat - 1 - i) {
-       case 0:
-               if (big_endian != CPU_BIG_ENDIAN) {
-                       bch2_bkey_swab_key(f, k);
-               } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-                       bch2_bkey_swab_key(f, k);
-                       bch2_bkey_swab_key(f, k);
-               }
-               break;
-       case 1:
-               if (version < bcachefs_metadata_version_bkey_renumber)
-                       bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
-               break;
-       case 2:
-               if (version < bcachefs_metadata_version_inode_btree_change &&
-                   btree_id == BTREE_ID_inodes) {
-                       if (!bkey_packed(k)) {
-                               struct bkey_i *u = packed_to_bkey(k);
-
-                               swap(u->k.p.inode, u->k.p.offset);
-                       } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
-                                  f->bits_per_field[BKEY_FIELD_OFFSET]) {
-                               struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
-                               swap(tmp.bits_per_field[BKEY_FIELD_INODE],
-                                    tmp.bits_per_field[BKEY_FIELD_OFFSET]);
-                               swap(tmp.field_offset[BKEY_FIELD_INODE],
-                                    tmp.field_offset[BKEY_FIELD_OFFSET]);
-
-                               if (!write)
-                                       swap(in, out);
-
-                               uk = __bch2_bkey_unpack_key(in, k);
-                               swap(uk.p.inode, uk.p.offset);
-                               BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
-                       }
-               }
-               break;
-       case 3:
-               if (version < bcachefs_metadata_version_snapshot &&
-                   (level || btree_type_has_snapshots(btree_id))) {
-                       struct bkey_i *u = packed_to_bkey(k);
-
-                       if (u) {
-                               u->k.p.snapshot = write
-                                       ? 0 : U32_MAX;
-                       } else {
-                               u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
-                               u64 max_packed = min_packed +
-                                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
-                               uk = __bch2_bkey_unpack_key(f, k);
-                               uk.p.snapshot = write
-                                       ? min_packed : min_t(u64, U32_MAX, max_packed);
-
-                               BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
-                       }
-               }
-
-               break;
-       case 4: {
-               struct bkey_s u;
-
-               if (!bkey_packed(k)) {
-                       u = bkey_i_to_s(packed_to_bkey(k));
-               } else {
-                       uk = __bch2_bkey_unpack_key(f, k);
-                       u.k = &uk;
-                       u.v = bkeyp_val(f, k);
-               }
-
-               if (big_endian != CPU_BIG_ENDIAN)
-                       bch2_bkey_swab_val(u);
-
-               ops = bch2_bkey_type_ops(k->type);
-
-               if (ops->compat)
-                       ops->compat(btree_id, version, big_endian, write, u);
-               break;
-       }
-       default:
-               BUG();
-       }
-}
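
bch2_bkey_types[] and bch2_bkey_ops[] above are both stamped out of the same
BCH_BKEY_TYPES() x-macro, so adding a key type in one list updates the enum,
the name table and the dispatch table together. A reduced standalone sketch of
the pattern (hypothetical type list, not the real one):

#include <stdio.h>

#define MY_TYPES()	\
	x(deleted)	\
	x(cookie)	\
	x(extent)

enum my_type {
#define x(name) TYPE_##name,
	MY_TYPES()
#undef x
	TYPE_MAX
};

static const char * const my_type_names[] = {
#define x(name) #name,
	MY_TYPES()
#undef x
	NULL
};

struct my_ops {
	void	(*to_text)(enum my_type);
};

static void generic_to_text(enum my_type t)
{
	printf("type %s\n", my_type_names[t]);
}

static const struct my_ops my_ops_table[] = {
#define x(name) [TYPE_##name] = { .to_text = generic_to_text },
	MY_TYPES()
#undef x
};

int main(void)
{
	my_ops_table[TYPE_cookie].to_text(TYPE_cookie);
	return 0;
}
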
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
deleted file mode 100644 (file)
index bf34111..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_METHODS_H
-#define _BCACHEFS_BKEY_METHODS_H
-
-#include "bkey.h"
-
-struct bch_fs;
-struct btree;
-struct btree_trans;
-struct bkey;
-enum btree_node_type;
-
-extern const char * const bch2_bkey_types[];
-extern const struct bkey_ops bch2_bkey_null_ops;
-
-/*
- * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad.
- * If invalid, the entire key will be deleted.
- *
- * @from describes where the key came from and carries bch_validate_flags;
- * more aggressive checks are enabled when the key is being written or
- * committed (see BCH_VALIDATE_commit).
- */
-struct bkey_ops {
-       int             (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
-                                       struct bkey_validate_context from);
-       void            (*val_to_text)(struct printbuf *, struct bch_fs *,
-                                      struct bkey_s_c);
-       void            (*swab)(struct bkey_s);
-       bool            (*key_normalize)(struct bch_fs *, struct bkey_s);
-       bool            (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-       int             (*trigger)(struct btree_trans *, enum btree_id, unsigned,
-                                  struct bkey_s_c, struct bkey_s,
-                                  enum btree_iter_update_trigger_flags);
-       void            (*compat)(enum btree_id id, unsigned version,
-                                 unsigned big_endian, int write,
-                                 struct bkey_s);
-
-       /* Size of value type when first created: */
-       unsigned        min_val_size;
-};
-
-extern const struct bkey_ops bch2_bkey_ops[];
-
-static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
-{
-       return likely(type < KEY_TYPE_MAX)
-               ? &bch2_bkey_ops[type]
-               : &bch2_bkey_null_ops;
-}
-
-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
-                        struct bkey_validate_context);
-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
-                      struct bkey_validate_context);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
-                           struct bkey_validate_context from);
-
-void bch2_bpos_to_text(struct printbuf *, struct bpos);
-void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *,
-                     struct bkey_s_c);
-void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
-                          struct bkey_s_c);
-
-void bch2_bkey_swab_val(struct bkey_s);
-
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
-static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
-{
-       return l->type == r->type &&
-               !bversion_cmp(l->bversion, r->bversion) &&
-               bpos_eq(l->p, bkey_start_pos(r));
-}
-
-bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-static inline int bch2_key_trigger(struct btree_trans *trans,
-               enum btree_id btree, unsigned level,
-               struct bkey_s_c old, struct bkey_s new,
-               enum btree_iter_update_trigger_flags flags)
-{
-       const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
-       return ops->trigger
-               ? ops->trigger(trans, btree, level, old, new, flags)
-               : 0;
-}
-
-static inline int bch2_key_trigger_old(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level,
-                       struct bkey_s_c old,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_i deleted;
-
-       bkey_init(&deleted.k);
-       deleted.k.p = old.k->p;
-
-       return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
-                               BTREE_TRIGGER_overwrite|flags);
-}
-
-static inline int bch2_key_trigger_new(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level,
-                       struct bkey_s new,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_i deleted;
-
-       bkey_init(&deleted.k);
-       deleted.k.p = new.k->p;
-
-       return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
-                               BTREE_TRIGGER_insert|flags);
-}
-
-void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-
-void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
-                       int, struct bkey_format *, struct bkey_packed *);
-
-static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
-                              unsigned version, unsigned big_endian,
-                              int write,
-                              struct bkey_format *f,
-                              struct bkey_packed *k)
-{
-       if (version < bcachefs_metadata_version_current ||
-           big_endian != CPU_BIG_ENDIAN ||
-           IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
-               __bch2_bkey_compat(level, btree_id, version,
-                                  big_endian, write, f, k);
-}
-
-#endif /* _BCACHEFS_BKEY_METHODS_H */
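
bch2_key_trigger_old()/_new() above reduce pure deletes and pure inserts to
the general (old, new) form by pairing the real key with a synthesized deleted
key at the same position, so a single trigger callback can account for all
three cases. A toy standalone illustration of why that helps (hypothetical
accounting callback, not bcachefs code):

#include <stdio.h>

struct key {
	unsigned	live_bytes;
};

/* one callback covers insert, overwrite and delete uniformly */
static void accounting_trigger(const struct key *old, const struct key *new,
			       long *usage)
{
	*usage += (long) new->live_bytes - (long) old->live_bytes;
}

int main(void)
{
	long usage = 0;
	struct key deleted = { 0 };	/* stand-in for the synthesized key */
	struct key k = { 4096 };

	accounting_trigger(&deleted, &k, &usage);	/* insert: +4096 */
	accounting_trigger(&k, &deleted, &usage);	/* delete: -4096 */
	printf("%ld\n", usage);				/* back to 0 */
	return 0;
}
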
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
deleted file mode 100644 (file)
index 4536eb5..0000000
+++ /dev/null
@@ -1,214 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_cmp.h"
-#include "bkey_sort.h"
-#include "bset.h"
-#include "extents.h"
-
-typedef int (*sort_cmp_fn)(const struct btree *,
-                          const struct bkey_packed *,
-                          const struct bkey_packed *);
-
-static inline bool sort_iter_end(struct sort_iter *iter)
-{
-       return !iter->used;
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
-                                 sort_cmp_fn cmp)
-{
-       unsigned i;
-
-       for (i = from;
-            i + 1 < iter->used &&
-            cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
-            i++)
-               swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-       unsigned i = iter->used;
-
-       while (i--)
-               sort_iter_sift(iter, i, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
-       return !sort_iter_end(iter) ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-       struct sort_iter_set *i = iter->data;
-
-       BUG_ON(!iter->used);
-
-       i->k = bkey_p_next(i->k);
-
-       BUG_ON(i->k > i->end);
-
-       if (i->k == i->end)
-               array_remove_item(iter->data, iter->used, 0);
-       else
-               sort_iter_sift(iter, 0, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
-                                                sort_cmp_fn cmp)
-{
-       struct bkey_packed *ret = sort_iter_peek(iter);
-
-       if (ret)
-               sort_iter_advance(iter, cmp);
-
-       return ret;
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- */
-static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
-                                              const struct bkey_packed *l,
-                                              const struct bkey_packed *r)
-{
-       return bch2_bkey_cmp_packed(b, l, r) ?:
-               cmp_int((unsigned long) l, (unsigned long) r);
-}
-
-static inline bool should_drop_next_key(struct sort_iter *iter)
-{
-       /*
-        * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
-        * the older key comes first; so if l->k compares equal to r->k then
-        * l->k is older and should be dropped.
-        */
-       return iter->used >= 2 &&
-               !bch2_bkey_cmp_packed(iter->b,
-                                iter->data[0].k,
-                                iter->data[1].k);
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
-                             struct sort_iter *iter)
-{
-       struct bkey_packed *out = dst->start;
-       struct bkey_packed *k;
-       struct btree_nr_keys nr;
-
-       memset(&nr, 0, sizeof(nr));
-
-       sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
-
-       while ((k = sort_iter_peek(iter))) {
-               if (!bkey_deleted(k) &&
-                   !should_drop_next_key(iter)) {
-                       bkey_p_copy(out, k);
-                       btree_keys_account_key_add(&nr, 0, out);
-                       out = bkey_p_next(out);
-               }
-
-               sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
-       }
-
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       return nr;
-}
-
-/* Sort + repack in a new format: */
-struct btree_nr_keys
-bch2_sort_repack(struct bset *dst, struct btree *src,
-                struct btree_node_iter *src_iter,
-                struct bkey_format *out_f,
-                bool filter_whiteouts)
-{
-       struct bkey_format *in_f = &src->format;
-       struct bkey_packed *in, *out = vstruct_last(dst);
-       struct btree_nr_keys nr;
-       bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
-
-       memset(&nr, 0, sizeof(nr));
-
-       while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
-               if (filter_whiteouts && bkey_deleted(in))
-                       continue;
-
-               if (!transform)
-                       bkey_p_copy(out, in);
-               else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-                                            ? in_f : &bch2_bkey_format_current, in))
-                       out->format = KEY_FORMAT_LOCAL_BTREE;
-               else
-                       bch2_bkey_unpack(src, (void *) out, in);
-
-               out->needs_whiteout = false;
-
-               btree_keys_account_key_add(&nr, 0, out);
-               out = bkey_p_next(out);
-       }
-
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       return nr;
-}
-
-static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
-                               const struct bkey_packed *l,
-                               const struct bkey_packed *r)
-{
-       return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
-               (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
-               (long) l - (long) r;
-}
-
-#include "btree_update_interior.h"
-
-/*
- * For sorting in the btree node write path: whiteouts not in the unwritten
- * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
- * dropped if overwritten by real keys:
- */
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
-{
-       struct bkey_packed *in, *next, *out = dst;
-
-       sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
-
-       while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
-               if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
-                       continue;
-
-               if ((next = sort_iter_peek(iter)) &&
-                   !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
-                       continue;
-
-               bkey_p_copy(out, in);
-               out = bkey_p_next(out);
-       }
-
-       return (u64 *) out - (u64 *) dst;
-}
-
-/*
- * Main sort routine for compacting a btree node in memory: we always drop
- * whiteouts because any whiteouts that need to be written are in the unwritten
- * whiteouts area:
- */
-unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
-{
-       struct bkey_packed *in, *out = dst;
-
-       sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
-
-       while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
-               if (bkey_deleted(in))
-                       continue;
-
-               bkey_p_copy(out, in);
-               out = bkey_p_next(out);
-       }
-
-       return (u64 *) out - (u64 *) dst;
-}
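
The sort_iter machinery above is a k-way merge without a heap: the head of
each source bset sits in a small array kept sorted by insertion, and after
each advance only the front element is sifted back into place - cheap when
there are at most MAX_BSETS + 1 sources. A reduced standalone sketch of the
same scheme over plain int arrays (hypothetical, for illustration):

#include <stdio.h>

struct src { const int *k, *end; };

/* bubble the front element down until the array of heads is sorted again */
static void sift(struct src *d, unsigned used)
{
	for (unsigned i = 0; i + 1 < used && *d[i].k > *d[i + 1].k; i++) {
		struct src t = d[i];
		d[i] = d[i + 1];
		d[i + 1] = t;
	}
}

static void merge(struct src *d, unsigned used)
{
	unsigned i;

	/* initial insertion sort, back to front, as sort_iter_sort() does */
	for (i = used; i--;)
		sift(d + i, used - i);

	while (used) {
		printf("%d ", *d[0].k++);

		if (d[0].k == d[0].end) {
			/* drop the exhausted source, keeping order */
			for (i = 1; i < used; i++)
				d[i - 1] = d[i];
			used--;
		} else {
			sift(d, used);
		}
	}
	printf("\n");
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 8 }, c[] = { 5, 6, 7 };
	struct src d[] = {
		{ a, a + 3 }, { b, b + 3 }, { c, c + 3 },
	};

	merge(d, 3);	/* prints 1 2 3 4 5 6 7 8 9 */
	return 0;
}
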
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
deleted file mode 100644 (file)
index 9be969d..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_SORT_H
-#define _BCACHEFS_BKEY_SORT_H
-
-struct sort_iter {
-       struct btree            *b;
-       unsigned                used;
-       unsigned                size;
-
-       struct sort_iter_set {
-               struct bkey_packed *k, *end;
-       } data[];
-};
-
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
-{
-       iter->b = b;
-       iter->used = 0;
-       iter->size = size;
-}
-
-struct sort_iter_stack {
-       struct sort_iter        iter;
-       struct sort_iter_set    sets[MAX_BSETS + 1];
-};
-
-static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
-{
-       sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
-}
-
-static inline void sort_iter_add(struct sort_iter *iter,
-                                struct bkey_packed *k,
-                                struct bkey_packed *end)
-{
-       BUG_ON(iter->used >= iter->size);
-
-       if (k != end)
-               iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
-                             struct sort_iter *);
-
-struct btree_nr_keys
-bch2_sort_repack(struct bset *, struct btree *,
-                struct btree_node_iter *,
-                struct bkey_format *, bool);
-
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
-unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
-
-#endif /* _BCACHEFS_BKEY_SORT_H */
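
struct sort_iter ends in a flexible array member, and struct sort_iter_stack
is the usual trick for getting a fixed-capacity instance on the stack: place
the storage immediately after the header struct and size the iterator to
match. A self-contained sketch of that pattern (hypothetical names, not
bcachefs code; like the original, it relies on the storage directly following
the header struct):

#include <stdio.h>

struct vec {
	unsigned	used, size;
	int		data[];		/* flexible array member */
};

struct vec_stack {			/* stack-allocatable variant */
	struct vec	v;
	int		slots[8];	/* backing storage for v.data */
};

static void vec_stack_init(struct vec_stack *s)
{
	s->v.used = 0;
	s->v.size = sizeof(s->slots) / sizeof(s->slots[0]);
}

static void vec_push(struct vec *v, int x)
{
	if (v->used < v->size)
		v->data[v->used++] = x;
}

int main(void)
{
	struct vec_stack s;

	vec_stack_init(&s);
	vec_push(&s.v, 42);
	printf("%d\n", s.v.data[0]);
	return 0;
}
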
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
deleted file mode 100644 (file)
index b4f328f..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_TYPES_H
-#define _BCACHEFS_BKEY_TYPES_H
-
-#include "bcachefs_format.h"
-
-/*
- * bkey_i      - bkey with inline value
- * bkey_s      - bkey with split value
- * bkey_s_c    - bkey with split value, const
- */
-
-#define bkey_p_next(_k)                vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
-       return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
-       return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
-       unsigned u64s = BKEY_U64s + val_u64s;
-
-       BUG_ON(u64s > U8_MAX);
-       k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
-       set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k)       ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k)       ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k)                              \
-       ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
-/* bkey with split value, const */
-struct bkey_s_c {
-       const struct bkey       *k;
-       const struct bch_val    *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
-       union {
-       struct {
-               struct bkey     *k;
-               struct bch_val  *v;
-       };
-       struct bkey_s_c         s_c;
-       };
-};
-
-#define bkey_s_null            ((struct bkey_s)   { .k = NULL })
-#define bkey_s_c_null          ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err)                ((struct bkey_s)   { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err)      ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
-       return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
-       return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
-       return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
-       return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from a bkey_i_extent
- * to a bkey_i - since that's always safe, instead of conversion functions.
- */
-#define x(name, ...)                                   \
-struct bkey_i_##name {                                                 \
-       union {                                                         \
-               struct bkey             k;                              \
-               struct bkey_i           k_i;                            \
-       };                                                              \
-       struct bch_##name               v;                              \
-};                                                                     \
-                                                                       \
-struct bkey_s_c_##name {                                               \
-       union {                                                         \
-       struct {                                                        \
-               const struct bkey       *k;                             \
-               const struct bch_##name *v;                             \
-       };                                                              \
-       struct bkey_s_c                 s_c;                            \
-       };                                                              \
-};                                                                     \
-                                                                       \
-struct bkey_s_##name {                                                 \
-       union {                                                         \
-       struct {                                                        \
-               struct bkey             *k;                             \
-               struct bch_##name       *v;                             \
-       };                                                              \
-       struct bkey_s_c_##name          c;                              \
-       struct bkey_s                   s;                              \
-       struct bkey_s_c                 s_c;                            \
-       };                                                              \
-};                                                                     \
-                                                                       \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);    \
-       return container_of(&k->k, struct bkey_i_##name, k);            \
-}                                                                      \
-                                                                       \
-static inline const struct bkey_i_##name *                             \
-bkey_i_to_##name##_c(const struct bkey_i *k)                           \
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);    \
-       return container_of(&k->k, struct bkey_i_##name, k);            \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)   \
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);  \
-       return (struct bkey_s_##name) {                                 \
-               .k = k.k,                                               \
-               .v = container_of(k.v, struct bch_##name, v),           \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);  \
-       return (struct bkey_s_c_##name) {                               \
-               .k = k.k,                                               \
-               .v = container_of(k.v, struct bch_##name, v),           \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{                                                                      \
-       return (struct bkey_s_##name) {                                 \
-               .k = &k->k,                                             \
-               .v = &k->v,                                             \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_c_##name                                   \
-name##_i_to_s_c(const struct bkey_i_##name *k)                         \
-{                                                                      \
-       return (struct bkey_s_c_##name) {                               \
-               .k = &k->k,                                             \
-               .v = &k->v,                                             \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)        \
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);    \
-       return (struct bkey_s_##name) {                                 \
-               .k = &k->k,                                             \
-               .v = container_of(&k->v, struct bch_##name, v),         \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_s_c_##name                                   \
-bkey_i_to_s_c_##name(const struct bkey_i *k)                           \
-{                                                                      \
-       EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);    \
-       return (struct bkey_s_c_##name) {                               \
-               .k = &k->k,                                             \
-               .v = container_of(&k->v, struct bch_##name, v),         \
-       };                                                              \
-}                                                                      \
-                                                                       \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{                                                                      \
-       struct bkey_i_##name *k =                                       \
-               container_of(&_k->k, struct bkey_i_##name, k);          \
-                                                                       \
-       bkey_init(&k->k);                                               \
-       memset(&k->v, 0, sizeof(k->v));                                 \
-       k->k.type = KEY_TYPE_##name;                                    \
-       set_bkey_val_bytes(&k->k, sizeof(k->v));                        \
-                                                                       \
-       return k;                                                       \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
-enum bch_validate_flags {
-       BCH_VALIDATE_write              = BIT(0),
-       BCH_VALIDATE_commit             = BIT(1),
-       BCH_VALIDATE_silent             = BIT(2),
-};
-
-#define BKEY_VALIDATE_CONTEXTS()       \
-       x(unknown)                      \
-       x(superblock)                   \
-       x(journal)                      \
-       x(btree_root)                   \
-       x(btree_node)                   \
-       x(commit)
-
-struct bkey_validate_context {
-       enum {
-#define x(n)   BKEY_VALIDATE_##n,
-       BKEY_VALIDATE_CONTEXTS()
-#undef x
-       }                       from:8;
-       enum bch_validate_flags flags:8;
-       u8                      level;
-       enum btree_id           btree;
-       bool                    root:1;
-       unsigned                journal_offset;
-       u64                     journal_seq;
-};
-
-#endif /* _BCACHEFS_BKEY_TYPES_H */
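
The big x-macro above generates, for every key type, typed views whose
anonymous unions make upcasting (say, bkey_s_c_cookie to bkey_s_c) plain
member access, while the downcast helpers assert the type tag. A reduced
standalone sketch of the idea (hypothetical 'cookie' payload, not bcachefs
code):

#include <assert.h>
#include <stdio.h>

struct key {
	unsigned	type;
};

struct val_cookie {
	unsigned long long cookie;
};

struct kv {				/* generic: untyped value pointer */
	const struct key	*k;
	const void		*v;
};

struct kv_cookie {			/* typed view of the same pair */
	union {
		struct {
			const struct key	*k;
			const struct val_cookie	*v;
		};
		struct kv s;		/* upcast is just .s */
	};
};

#define TYPE_cookie 1

static struct kv_cookie kv_to_cookie(struct kv k)
{
	assert(k.k->type == TYPE_cookie);	/* checked downcast */
	return (struct kv_cookie) { .s = k };
}

int main(void)
{
	struct key k = { TYPE_cookie };
	struct val_cookie v = { 42 };
	struct kv pair = { &k, &v };
	struct kv_cookie c = kv_to_cookie(pair);

	printf("%llu\n", c.v->cookie);
	return 0;
}
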
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
deleted file mode 100644 (file)
index 32841f7..0000000
+++ /dev/null
@@ -1,1576 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for working with individual keys, and sorted sets of keys within a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <linux/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
-                                                 struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
-       unsigned n = ARRAY_SIZE(iter->data);
-
-       while (n && __btree_node_iter_set_end(iter, n - 1))
-               --n;
-
-       return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
-       return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
-                   struct bset *i, unsigned set)
-{
-       struct bkey_packed *_k, *_n;
-       struct bkey uk, n;
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-
-       if (!i->u64s)
-               return;
-
-       for (_k = i->start;
-            _k < vstruct_last(i);
-            _k = _n) {
-               _n = bkey_p_next(_k);
-
-               if (!_k->u64s) {
-                       printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
-                              _k->_data - i->_data);
-                       break;
-               }
-
-               k = bkey_disassemble(b, _k, &uk);
-
-               printbuf_reset(&buf);
-               if (c)
-                       bch2_bkey_val_to_text(&buf, c, k);
-               else
-                       bch2_bkey_to_text(&buf, k.k);
-               printk(KERN_ERR "block %u key %5zu: %s\n", set,
-                      _k->_data - i->_data, buf.buf);
-
-               if (_n == vstruct_last(i))
-                       continue;
-
-               n = bkey_unpack_key(b, _n);
-
-               if (bpos_lt(n.p, k.k->p)) {
-                       printk(KERN_ERR "Key skipped backwards\n");
-                       continue;
-               }
-
-               if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
-                       printk(KERN_ERR "Duplicate keys\n");
-       }
-
-       printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
-       console_lock();
-       for_each_bset(b, t)
-               bch2_dump_bset(c, b, bset(b, t), t - b->set);
-       console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
-                             struct btree_node_iter *iter)
-{
-       struct btree_node_iter_set *set;
-       struct printbuf buf = PRINTBUF;
-
-       printk(KERN_ERR "btree node iter with %u/%u sets:\n",
-              __btree_node_iter_used(iter), b->nsets);
-
-       btree_node_iter_for_each(iter, set) {
-               struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-               struct bset_tree *t = bch2_bkey_to_bset(b, k);
-               struct bkey uk = bkey_unpack_key(b, k);
-
-               printbuf_reset(&buf);
-               bch2_bkey_to_text(&buf, &uk);
-               printk(KERN_ERR "set %zu key %u: %s\n",
-                      t - b->set, set->k, buf.buf);
-       }
-
-       printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
-       struct bkey_packed *k;
-       struct btree_nr_keys nr = {};
-
-       for_each_bset(b, t)
-               bset_tree_for_each_key(b, t, k)
-                       if (!bkey_deleted(k))
-                               btree_keys_account_key_add(&nr, t - b->set, k);
-       return nr;
-}
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
-       struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
-       BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
-                                           struct btree *b)
-{
-       struct btree_node_iter iter = *_iter;
-       const struct bkey_packed *k, *n;
-
-       k = bch2_btree_node_iter_peek_all(&iter, b);
-       __bch2_btree_node_iter_advance(&iter, b);
-       n = bch2_btree_node_iter_peek_all(&iter, b);
-
-       bkey_unpack_key(b, k);
-
-       if (n &&
-           bkey_iter_cmp(b, k, n) > 0) {
-               struct btree_node_iter_set *set;
-               struct bkey ku = bkey_unpack_key(b, k);
-               struct bkey nu = bkey_unpack_key(b, n);
-               struct printbuf buf1 = PRINTBUF;
-               struct printbuf buf2 = PRINTBUF;
-
-               bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&buf1, &ku);
-               bch2_bkey_to_text(&buf2, &nu);
-               printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
-                      buf1.buf, buf2.buf);
-               printk(KERN_ERR "iter was:");
-
-               btree_node_iter_for_each(_iter, set) {
-                       struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
-                       struct bset_tree *t = bch2_bkey_to_bset(b, k2);
-                       printk(" [%zi %zi]", t - b->set,
-                              k2->_data - bset(b, t)->_data);
-               }
-               panic("\n");
-       }
-}
-
-void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-                                  struct btree *b)
-{
-       struct btree_node_iter_set *set, *s2;
-       struct bkey_packed *k, *p;
-
-       if (bch2_btree_node_iter_end(iter))
-               return;
-
-       /* Verify no duplicates: */
-       btree_node_iter_for_each(iter, set) {
-               BUG_ON(set->k > set->end);
-               btree_node_iter_for_each(iter, s2)
-                       BUG_ON(set != s2 && set->end == s2->end);
-       }
-
-       /* Verify that set->end is correct: */
-       btree_node_iter_for_each(iter, set) {
-               for_each_bset(b, t)
-                       if (set->end == t->end_offset) {
-                               BUG_ON(set->k < btree_bkey_first_offset(t) ||
-                                      set->k >= t->end_offset);
-                               goto found;
-                       }
-               BUG();
-found:
-               do {} while (0); /* a label must be followed by a statement */
-       }
-
-       /* Verify iterator is sorted: */
-       btree_node_iter_for_each(iter, set)
-               BUG_ON(set != iter->data &&
-                      btree_node_iter_cmp(b, set[-1], set[0]) > 0);
-
-       k = bch2_btree_node_iter_peek_all(iter, b);
-
-       for_each_bset(b, t) {
-               if (iter->data[0].end == t->end_offset)
-                       continue;
-
-               p = bch2_bkey_prev_all(b, t,
-                       bch2_btree_node_iter_bset_pos(iter, b, t));
-
-               BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
-       }
-}
-
-static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
-                                    struct bkey_packed *insert, unsigned clobber_u64s)
-{
-       struct bset_tree *t = bch2_bkey_to_bset(b, where);
-       struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
-       struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-#if 0
-       BUG_ON(prev &&
-              bkey_iter_cmp(b, prev, insert) > 0);
-#else
-       if (prev &&
-           bkey_iter_cmp(b, prev, insert) > 0) {
-               struct bkey k1 = bkey_unpack_key(b, prev);
-               struct bkey k2 = bkey_unpack_key(b, insert);
-
-               bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&buf1, &k1);
-               bch2_bkey_to_text(&buf2, &k2);
-
-               panic("prev > insert:\n"
-                     "prev    key %s\n"
-                     "insert  key %s\n",
-                     buf1.buf, buf2.buf);
-       }
-#endif
-#if 0
-       BUG_ON(next != btree_bkey_last(b, t) &&
-              bkey_iter_cmp(b, insert, next) > 0);
-#else
-       if (next != btree_bkey_last(b, t) &&
-           bkey_iter_cmp(b, insert, next) > 0) {
-               struct bkey k1 = bkey_unpack_key(b, insert);
-               struct bkey k2 = bkey_unpack_key(b, next);
-
-               bch2_dump_btree_node(NULL, b);
-               bch2_bkey_to_text(&buf1, &k1);
-               bch2_bkey_to_text(&buf2, &k2);
-
-               panic("insert > next:\n"
-                     "insert  key %s\n"
-                     "next    key %s\n",
-                     buf1.buf, buf2.buf);
-       }
-#endif
-}
-
-static inline void bch2_verify_insert_pos(struct btree *b,
-                                         struct bkey_packed *where,
-                                         struct bkey_packed *insert,
-                                         unsigned clobber_u64s)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
-               __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
-}
-
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED U8_MAX
-#define BFLOAT_FAILED          U8_MAX
-
-struct bkey_float {
-       u8              exponent;
-       u8              key_offset;
-       u16             mantissa;
-};
-#define BKEY_MANTISSA_BITS     16
-
-struct ro_aux_tree {
-       u8                      nothing[0];
-       struct bkey_float       f[];
-};
-
-struct rw_aux_tree {
-       u16             offset;
-       struct bpos     k;
-};
-
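
A quick standalone check of the ro node layout above, mirroring struct bkey_float only (struct bpos comes from bcachefs headers, so the rw entry isn't reproduced):

#include <assert.h>
#include <stdint.h>

struct bkey_float_sketch {
        uint8_t  exponent;
        uint8_t  key_offset;
        uint16_t mantissa;
};

/* 4 bytes per node, so 16 nodes per 64 byte hardware cacheline - which is
 * what makes prefetching base->f[n << 4] in bset_search_tree() pay off */
static_assert(sizeof(struct bkey_float_sketch) == 4, "node should be 4 bytes");

int main(void) { return 0; }
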
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
-       BUG_ON(t->aux_data_offset == U16_MAX);
-
-       switch (bset_aux_tree_type(t)) {
-       case BSET_NO_AUX_TREE:
-               return t->aux_data_offset;
-       case BSET_RO_AUX_TREE:
-               return t->aux_data_offset +
-                       DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
-       case BSET_RW_AUX_TREE:
-               return t->aux_data_offset +
-                       DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
-       default:
-               BUG();
-       }
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
-                                       const struct bset_tree *t)
-{
-       return t == b->set
-               ? DIV_ROUND_UP(b->unpack_fn_len, 8)
-               : bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
-                            const struct bset_tree *t)
-{
-       return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
-                                           const struct bset_tree *t)
-{
-       EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
-       return __aux_tree_base(b, t);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
-                                    const struct bset_tree *t,
-                                    unsigned idx)
-{
-       return ro_aux_tree_base(b, t)->f + idx;
-}
-
-static void __bset_aux_tree_verify(struct btree *b)
-{
-       for_each_bset(b, t) {
-               if (t->aux_data_offset == U16_MAX)
-                       continue;
-
-               BUG_ON(t != b->set &&
-                      t[-1].aux_data_offset == U16_MAX);
-
-               BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
-               BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
-               BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
-       }
-}
-
-static inline void bset_aux_tree_verify(struct btree *b)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
-               __bset_aux_tree_verify(b);
-}
-
-void bch2_btree_keys_init(struct btree *b)
-{
-       unsigned i;
-
-       b->nsets                = 0;
-       memset(&b->nr, 0, sizeof(b->nr));
-
-       for (i = 0; i < MAX_BSETS; i++)
-               b->set[i].data_offset = U16_MAX;
-
-       bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * t->tree is a binary search tree in an array; each node corresponds to a key
- * in one cacheline in t->set (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
- * then bkey_float->key_offset gives us the offset within that cacheline, in
- * units of 8 bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for an arbitrary key we need to know what the key
- * immediately preceding it is: we have to check if the two keys differ in the
- * bits we're going to store in bkey_float->mantissa. make_bfloat() derives
- * both neighbouring keys from the tree structure itself.
- */
-
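
A rough standalone sketch of that arithmetic, with BSET_CACHELINE hardcoded to this file's value of 256 (real lookups go through cacheline_to_bkey() just below):

#include <stddef.h>
#include <stdio.h>

#define BSET_CACHELINE 256

/* byte distance from the cacheline-aligned start of the bset to the key at
 * (cacheline, key_offset), where key_offset counts 8 byte units */
static size_t sketch_key_bytes(unsigned cacheline, unsigned key_offset)
{
        return (size_t) cacheline * BSET_CACHELINE + key_offset * 8;
}

int main(void)
{
        /* a key 3 u64s into cacheline 2: 2 * 256 + 3 * 8 = 536 */
        printf("%zu\n", sketch_key_bytes(2, 3));
        return 0;
}
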
-static inline void *bset_cacheline(const struct btree *b,
-                                  const struct bset_tree *t,
-                                  unsigned cacheline)
-{
-       return (void *) round_down((unsigned long) btree_bkey_first(b, t),
-                                  L1_CACHE_BYTES) +
-               cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
-                                            const struct bset_tree *t,
-                                            unsigned cacheline,
-                                            unsigned offset)
-{
-       return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
-                                 const struct bset_tree *t,
-                                 const struct bkey_packed *k)
-{
-       return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
-                                         const struct bset_tree *t,
-                                         unsigned cacheline,
-                                         const struct bkey_packed *k)
-{
-       return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
-                                        const struct bset_tree *t,
-                                        unsigned cacheline,
-                                        const struct bkey_packed *k)
-{
-       size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
-       EBUG_ON(m > U8_MAX);
-       return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
-                                              const struct bset_tree *t,
-                                              unsigned j)
-{
-       return cacheline_to_bkey(b, t,
-                       __eytzinger1_to_inorder(j, t->size - 1, t->extra),
-                       bkey_float(b, t, j)->key_offset);
-}
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
-                                      const struct bset_tree *t)
-{
-       EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
-       return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table - the
- * rw_aux_tree.
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
-                                         struct bset_tree *t,
-                                         unsigned j)
-{
-       return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
-                           unsigned j, struct bkey_packed *k)
-{
-       EBUG_ON(k >= btree_bkey_last(b, t));
-
-       rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
-               .offset = __btree_node_key_to_offset(b, k),
-               .k      = bkey_unpack_pos(b, k),
-       };
-}
-
-static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
-       struct bkey_packed *k = btree_bkey_first(b, t);
-       unsigned j = 0;
-
-       BUG_ON(bset_has_ro_aux_tree(t));
-
-       if (!bset_has_rw_aux_tree(t))
-               return;
-
-       BUG_ON(t->size < 1);
-       BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
-       goto start;
-       while (1) {
-               if (rw_aux_to_bkey(b, t, j) == k) {
-                       BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
-                                       bkey_unpack_pos(b, k)));
-start:
-                       if (++j == t->size)
-                               break;
-
-                       BUG_ON(rw_aux_tree(b, t)[j].offset <=
-                              rw_aux_tree(b, t)[j - 1].offset);
-               }
-
-               k = bkey_p_next(k);
-               BUG_ON(k >= btree_bkey_last(b, t));
-       }
-}
-
-static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
-                                               struct bset_tree *t)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
-               __bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
-                                   struct bset_tree *t,
-                                   unsigned offset)
-{
-       unsigned bset_offs = offset - btree_bkey_first_offset(t);
-       unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
-       unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
-
-       EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-       EBUG_ON(!t->size);
-       EBUG_ON(idx > t->size);
-
-       while (idx < t->size &&
-              rw_aux_tree(b, t)[idx].offset < offset)
-               idx++;
-
-       while (idx &&
-              rw_aux_tree(b, t)[idx - 1].offset >= offset)
-               idx--;
-
-       EBUG_ON(idx < t->size &&
-               rw_aux_tree(b, t)[idx].offset < offset);
-       EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
-       EBUG_ON(idx + 1 < t->size &&
-               rw_aux_tree(b, t)[idx].offset ==
-               rw_aux_tree(b, t)[idx + 1].offset);
-
-       return idx;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
-                                    const struct bkey_float *f)
-{
-       u64 v;
-
-       EBUG_ON(!bkey_packed(k));
-
-       v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
-       /*
-        * In little endian, we're shifting off low bits (and then the bits we
-        * want are at the low end), in big endian we're shifting off high bits
-        * (and then the bits we want are at the high end, so we shift them
-        * back down):
-        */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       v >>= f->exponent & 7;
-#else
-       v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
-#endif
-       return (u16) v;
-}
-
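
A little-endian-only standalone sketch of that extraction, reading from a plain byte buffer instead of a packed key:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* read 16 bits starting at bit offset exp: the same byte step (exp >> 3)
 * plus sub-byte shift (exp & 7) that bkey_mantissa() does */
static uint16_t mantissa_le(const uint8_t *buf, unsigned exp)
{
        uint64_t v;

        memcpy(&v, buf + (exp >> 3), sizeof(v));        /* get_unaligned() */
        return (uint16_t) (v >> (exp & 7));
}

int main(void)
{
        uint8_t buf[16] = { 0 };

        buf[1] = 0xab;          /* bits 8..15 */
        buf[2] = 0xcd;          /* bits 16..23 */

        printf("%#x\n", mantissa_le(buf, 12));          /* prints 0xcda */
        return 0;
}
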
-static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
-                                       unsigned j,
-                                       struct bkey_packed *min_key,
-                                       struct bkey_packed *max_key)
-{
-       struct bkey_float *f = bkey_float(b, t, j);
-       struct bkey_packed *m = tree_to_bkey(b, t, j);
-       struct bkey_packed *l = is_power_of_2(j)
-               ? min_key
-               : tree_to_bkey(b, t, j >> ffs(j));
-       struct bkey_packed *r = is_power_of_2(j + 1)
-               ? max_key
-               : tree_to_bkey(b, t, j >> (ffz(j) + 1));
-       unsigned mantissa;
-       int shift, exponent, high_bit;
-
-       /*
-        * for failed bfloats, the lookup code falls back to comparing against
-        * the original key.
-        */
-
-       if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
-           !b->nr_key_bits) {
-               f->exponent = BFLOAT_FAILED_UNPACKED;
-               return;
-       }
-
-       /*
-        * The greatest differing bit of l and r is the first bit we must
-        * include in the bfloat mantissa we're creating in order to do
-        * comparisons - that bit always becomes the high bit of
-        * bfloat->mantissa, and thus the exponent we're calculating here is
-        * the position of what will become the low bit in bfloat->mantissa:
-        *
-        * Note that this may be negative - we may be running off the low end
-        * of the key: we handle this later:
-        */
-       high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
-                      min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
-       exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
-
-       /*
-        * Then we calculate the actual shift value, from the start of the key
-        * (k->_data), to get the key bits starting at exponent:
-        */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
-       EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
-#else
-       shift = high_bit_offset +
-               b->nr_key_bits -
-               exponent -
-               BKEY_MANTISSA_BITS;
-
-       EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
-       EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
-       f->exponent = shift;
-       mantissa = bkey_mantissa(m, f);
-
-       /*
-        * If we've got garbage bits, set them to all 1s - it's legal for the
-        * bfloat to compare larger than the original key, but not smaller:
-        */
-       if (exponent < 0)
-               mantissa |= ~(~0U << -exponent);
-
-       f->mantissa = mantissa;
-}
-
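To make the exponent/shift arithmetic above concrete, with BKEY_MANTISSA_BITS = 16: if l and r first differ at bit 40 (and b->nr_key_bits >= 16), high_bit = 40 and exponent = 40 - 15 = 25, so the mantissa holds key bits 25..40. With only 10 key bits, high_bit is clamped to 9 and exponent is 9 - 15 = -6; the 6 mantissa positions below bit 0 are the garbage bits that get forced to 1s, so the bfloat can compare larger than the original key but never smaller.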
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-       bset_aux_tree_verify(b);
-
-       return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
-static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-       return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
-       return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
-       struct bkey_packed *k;
-
-       t->size = 1;
-       t->extra = BSET_RW_AUX_TREE_VAL;
-       rw_aux_tree(b, t)[0].offset =
-               __btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
-       bset_tree_for_each_key(b, t, k) {
-               if (t->size == bset_rw_tree_capacity(b, t))
-                       break;
-
-               if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
-                   L1_CACHE_BYTES)
-                       rw_aux_tree_set(b, t, t->size++, k);
-       }
-}
-
-static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
-       struct bkey_packed *k = btree_bkey_first(b, t);
-       struct bkey_i min_key, max_key;
-       unsigned cacheline = 1;
-
-       t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
-                     bset_ro_tree_capacity(b, t));
-retry:
-       if (t->size < 2) {
-               t->size = 0;
-               t->extra = BSET_NO_AUX_TREE_VAL;
-               return;
-       }
-
-       t->extra = eytzinger1_extra(t->size - 1);
-
-       /* First we figure out where the first key in each cacheline is */
-       eytzinger1_for_each(j, t->size - 1) {
-               while (bkey_to_cacheline(b, t, k) < cacheline)
-                       k = bkey_p_next(k);
-
-               if (k >= btree_bkey_last(b, t)) {
-                       /* XXX: this path sucks */
-                       t->size--;
-                       goto retry;
-               }
-
-               bkey_float(b, t, j)->key_offset =
-                       bkey_to_cacheline_offset(b, t, cacheline++, k);
-
-               EBUG_ON(tree_to_bkey(b, t, j) != k);
-       }
-
-       if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
-               bkey_init(&min_key.k);
-               min_key.k.p = b->data->min_key;
-       }
-
-       if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
-               bkey_init(&max_key.k);
-               max_key.k.p = b->data->max_key;
-       }
-
-       /* Then we build the tree */
-       eytzinger1_for_each(j, t->size - 1)
-               make_bfloat(b, t, j,
-                           bkey_to_packed(&min_key),
-                           bkey_to_packed(&max_key));
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
-       struct bset_tree *i;
-
-       for (i = b->set; i != t; i++)
-               BUG_ON(bset_has_rw_aux_tree(i));
-
-       bch2_bset_set_no_aux_tree(b, t);
-
-       /* round up to next cacheline: */
-       t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
-                                     SMP_CACHE_BYTES / sizeof(u64));
-
-       bset_aux_tree_verify(b);
-}
-
-void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
-                            bool writeable)
-{
-       if (writeable
-           ? bset_has_rw_aux_tree(t)
-           : bset_has_ro_aux_tree(t))
-               return;
-
-       bset_alloc_tree(b, t);
-
-       if (!__bset_tree_capacity(b, t))
-               return;
-
-       if (writeable)
-               __build_rw_aux_tree(b, t);
-       else
-               __build_ro_aux_tree(b, t);
-
-       bset_aux_tree_verify(b);
-}
-
-void bch2_bset_init_first(struct btree *b, struct bset *i)
-{
-       struct bset_tree *t;
-
-       BUG_ON(b->nsets);
-
-       memset(i, 0, sizeof(*i));
-       get_random_bytes(&i->seq, sizeof(i->seq));
-       SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-       t = &b->set[b->nsets++];
-       set_btree_bset(b, t, i);
-}
-
-void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
-{
-       struct bset *i = &bne->keys;
-       struct bset_tree *t;
-
-       BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
-       BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
-       BUG_ON(b->nsets >= MAX_BSETS);
-
-       memset(i, 0, sizeof(*i));
-       i->seq = btree_bset_first(b)->seq;
-       SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-       t = &b->set[b->nsets++];
-       set_btree_bset(b, t, i);
-}
-
-/*
- * find _some_ key in the same bset as @k that precedes @k - not necessarily the
- * immediate predecessor:
- */
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
-                                      struct bkey_packed *k)
-{
-       struct bkey_packed *p;
-       unsigned offset;
-       int j;
-
-       EBUG_ON(k < btree_bkey_first(b, t) ||
-               k > btree_bkey_last(b, t));
-
-       if (k == btree_bkey_first(b, t))
-               return NULL;
-
-       switch (bset_aux_tree_type(t)) {
-       case BSET_NO_AUX_TREE:
-               p = btree_bkey_first(b, t);
-               break;
-       case BSET_RO_AUX_TREE:
-               j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
-               do {
-                       p = j ? tree_to_bkey(b, t,
-                                       __inorder_to_eytzinger1(j--,
-                                                       t->size - 1, t->extra))
-                             : btree_bkey_first(b, t);
-               } while (p >= k);
-               break;
-       case BSET_RW_AUX_TREE:
-               offset = __btree_node_key_to_offset(b, k);
-               j = rw_aux_tree_bsearch(b, t, offset);
-               p = j ? rw_aux_to_bkey(b, t, j - 1)
-                     : btree_bkey_first(b, t);
-               break;
-       }
-
-       return p;
-}
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
-                                         struct bset_tree *t,
-                                         struct bkey_packed *k,
-                                         unsigned min_key_type)
-{
-       struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
-
-       while ((p = __bkey_prev(b, t, k)) && !ret) {
-               for (i = p; i != k; i = bkey_p_next(i))
-                       if (i->type >= min_key_type)
-                               ret = i;
-
-               k = p;
-       }
-
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
-               BUG_ON(ret >= orig_k);
-
-               for (i = ret
-                       ? bkey_p_next(ret)
-                       : btree_bkey_first(b, t);
-                    i != orig_k;
-                    i = bkey_p_next(i))
-                       BUG_ON(i->type >= min_key_type);
-       }
-
-       return ret;
-}
-
-/* Insert */
-
-static void rw_aux_tree_insert_entry(struct btree *b,
-                                    struct bset_tree *t,
-                                    unsigned idx)
-{
-       EBUG_ON(!idx || idx > t->size);
-       struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
-       struct bkey_packed *end = idx < t->size
-                                 ? rw_aux_to_bkey(b, t, idx)
-                                 : btree_bkey_last(b, t);
-
-       if (t->size < bset_rw_tree_capacity(b, t) &&
-           (void *) end - (void *) start > L1_CACHE_BYTES) {
-               struct bkey_packed *k = start;
-
-               while (1) {
-                       k = bkey_p_next(k);
-                       if (k == end)
-                               break;
-
-                       if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
-                               memmove(&rw_aux_tree(b, t)[idx + 1],
-                                       &rw_aux_tree(b, t)[idx],
-                                       (void *) &rw_aux_tree(b, t)[t->size] -
-                                       (void *) &rw_aux_tree(b, t)[idx]);
-                               t->size++;
-                               rw_aux_tree_set(b, t, idx, k);
-                               break;
-                       }
-               }
-       }
-}
-
-static void bch2_bset_fix_lookup_table(struct btree *b,
-                                      struct bset_tree *t,
-                                      struct bkey_packed *_where,
-                                      unsigned clobber_u64s,
-                                      unsigned new_u64s)
-{
-       int shift = new_u64s - clobber_u64s;
-       unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
-
-       EBUG_ON(bset_has_ro_aux_tree(t));
-
-       if (!bset_has_rw_aux_tree(t))
-               return;
-
-       if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
-               rw_aux_tree_insert_entry(b, t, t->size);
-               goto verify;
-       }
-
-       /* returns first entry >= where */
-       idx = rw_aux_tree_bsearch(b, t, where);
-
-       if (rw_aux_tree(b, t)[idx].offset == where) {
-               if (!idx) { /* never delete first entry */
-                       idx++;
-               } else if (where < t->end_offset) {
-                       rw_aux_tree_set(b, t, idx++, _where);
-               } else {
-                       EBUG_ON(where != t->end_offset);
-                       rw_aux_tree_insert_entry(b, t, --t->size);
-                       goto verify;
-               }
-       }
-
-       EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
-       if (idx < t->size &&
-           rw_aux_tree(b, t)[idx].offset + shift ==
-           rw_aux_tree(b, t)[idx - 1].offset) {
-               memmove(&rw_aux_tree(b, t)[idx],
-                       &rw_aux_tree(b, t)[idx + 1],
-                       (void *) &rw_aux_tree(b, t)[t->size] -
-                       (void *) &rw_aux_tree(b, t)[idx + 1]);
-               t->size -= 1;
-       }
-
-       for (j = idx; j < t->size; j++)
-               rw_aux_tree(b, t)[j].offset += shift;
-
-       EBUG_ON(idx < t->size &&
-               rw_aux_tree(b, t)[idx].offset ==
-               rw_aux_tree(b, t)[idx - 1].offset);
-
-       rw_aux_tree_insert_entry(b, t, idx);
-
-verify:
-       bch2_bset_verify_rw_aux_tree(b, t);
-       bset_aux_tree_verify(b);
-}
-
-void bch2_bset_insert(struct btree *b,
-                     struct bkey_packed *where,
-                     struct bkey_i *insert,
-                     unsigned clobber_u64s)
-{
-       struct bkey_format *f = &b->format;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bkey_packed packed, *src = bkey_to_packed(insert);
-
-       bch2_bset_verify_rw_aux_tree(b, t);
-       bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
-
-       if (bch2_bkey_pack_key(&packed, &insert->k, f))
-               src = &packed;
-
-       if (!bkey_deleted(&insert->k))
-               btree_keys_account_key_add(&b->nr, t - b->set, src);
-
-       if (src->u64s != clobber_u64s) {
-               u64 *src_p = (u64 *) where->_data + clobber_u64s;
-               u64 *dst_p = (u64 *) where->_data + src->u64s;
-
-               EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
-                       (int) clobber_u64s - src->u64s);
-
-               memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
-               le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
-               set_btree_bset_end(b, t);
-       }
-
-       memcpy_u64s_small(where, src,
-                   bkeyp_key_u64s(f, src));
-       memcpy_u64s(bkeyp_val(f, where), &insert->v,
-                   bkeyp_val_u64s(f, src));
-
-       if (src->u64s != clobber_u64s)
-               bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
-       bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_bset_delete(struct btree *b,
-                     struct bkey_packed *where,
-                     unsigned clobber_u64s)
-{
-       struct bset_tree *t = bset_tree_last(b);
-       u64 *src_p = (u64 *) where->_data + clobber_u64s;
-       u64 *dst_p = where->_data;
-
-       bch2_bset_verify_rw_aux_tree(b, t);
-
-       EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
-       memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
-       le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
-       set_btree_bset_end(b, t);
-
-       bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search)
-{
-       unsigned l = 0, r = t->size;
-
-       while (l + 1 != r) {
-               unsigned m = (l + r) >> 1;
-
-               if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
-                       l = m;
-               else
-                       r = m;
-       }
-
-       return rw_aux_to_bkey(b, t, l);
-}
-
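
The same loop shape over a plain int array, as a standalone sketch (the real table compares struct bpos entries and always has at least one entry, so returning l = 0 is a safe fallback):

#include <stdio.h>

/* last index i in [0, n) with a[i] < key, or 0 if there is none; the
 * caller then scans forward linearly, as bch2_bset_search_linear() does */
static unsigned last_below(const int *a, unsigned n, int key)
{
        unsigned l = 0, r = n;

        while (l + 1 != r) {
                unsigned m = (l + r) / 2;

                if (a[m] < key)
                        l = m;
                else
                        r = m;
        }
        return l;
}

int main(void)
{
        int a[] = { 1, 3, 5, 7, 9 };

        printf("%u\n", last_below(a, 5, 6));    /* 2: a[2] = 5 < 6 */
        return 0;
}
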
-static inline void prefetch_four_cachelines(void *p)
-{
-#ifdef CONFIG_X86_64
-       asm("prefetcht0 (-127 + 64 * 0)(%0);"
-           "prefetcht0 (-127 + 64 * 1)(%0);"
-           "prefetcht0 (-127 + 64 * 2)(%0);"
-           "prefetcht0 (-127 + 64 * 3)(%0);"
-           :
-           : "r" (p + 127));
-#else
-       prefetch(p + L1_CACHE_BYTES * 0);
-       prefetch(p + L1_CACHE_BYTES * 1);
-       prefetch(p + L1_CACHE_BYTES * 2);
-       prefetch(p + L1_CACHE_BYTES * 3);
-#endif
-}
-
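A note on the x86 asm above: the base pointer is biased by +127 so that all four displacements (-127, -63, 1, 65) fit in a signed 8 bit immediate, avoiding the 4 byte displacement encodings that plain offsets of 128 and 192 would need - my reading of the otherwise odd-looking constants, not something the code itself states.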
-static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
-                                             const struct bkey_float *f)
-{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-       unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
-
-       return f->exponent > key_bits_start;
-#else
-       unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
-
-       return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
-#endif
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
-                               const struct bset_tree *t,
-                               const struct bpos *search,
-                               const struct bkey_packed *packed_search)
-{
-       struct ro_aux_tree *base = ro_aux_tree_base(b, t);
-       struct bkey_float *f;
-       struct bkey_packed *k;
-       unsigned inorder, n = 1, l, r;
-       int cmp;
-
-       do {
-               if (likely(n << 4 < t->size))
-                       prefetch(&base->f[n << 4]);
-
-               f = &base->f[n];
-               if (unlikely(f->exponent >= BFLOAT_FAILED))
-                       goto slowpath;
-
-               l = f->mantissa;
-               r = bkey_mantissa(packed_search, f);
-
-               if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
-                       goto slowpath;
-
-               n = n * 2 + (l < r);
-               continue;
-slowpath:
-               k = tree_to_bkey(b, t, n);
-               cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
-               if (!cmp)
-                       return k;
-
-               n = n * 2 + (cmp < 0);
-       } while (n < t->size);
-
-       inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
-
-       /*
-        * n would have been the node we recursed to - the low bit tells us if
-        * we recursed left or recursed right.
-        */
-       if (likely(!(n & 1))) {
-               --inorder;
-               if (unlikely(!inorder))
-                       return btree_bkey_first(b, t);
-
-               f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
-       }
-
-       return cacheline_to_bkey(b, t, inorder, f->key_offset);
-}
-
-static __always_inline __flatten
-struct bkey_packed *__bch2_bset_search(struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search,
-                               const struct bkey_packed *lossy_packed_search)
-{
-
-       /*
-        * First we search for the right cacheline, then we do a linear search
-        * within that cacheline.
-        *
-        * To search for the cacheline, there are three different possibilities:
-        *  * The set is too small to have a search tree, so we just do a linear
-        *    search over the whole set.
-        *  * The set is the one we're currently inserting into; keeping a full
-        *    auxiliary search tree up to date would be too expensive, so we
-        *    use a much simpler lookup table to do a binary search -
-        *    bset_search_write_set().
-        *  * Or we use the auxiliary search tree we constructed earlier -
-        *    bset_search_tree()
-        */
-
-       switch (bset_aux_tree_type(t)) {
-       case BSET_NO_AUX_TREE:
-               return btree_bkey_first(b, t);
-       case BSET_RW_AUX_TREE:
-               return bset_search_write_set(b, t, search);
-       case BSET_RO_AUX_TREE:
-               return bset_search_tree(b, t, search, lossy_packed_search);
-       default:
-               BUG();
-       }
-}
-
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search_linear(struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search,
-                               struct bkey_packed *packed_search,
-                               const struct bkey_packed *lossy_packed_search,
-                               struct bkey_packed *m)
-{
-       if (lossy_packed_search)
-               while (m != btree_bkey_last(b, t) &&
-                      bkey_iter_cmp_p_or_unp(b, m,
-                                       lossy_packed_search, search) < 0)
-                       m = bkey_p_next(m);
-
-       if (!packed_search)
-               while (m != btree_bkey_last(b, t) &&
-                      bkey_iter_pos_cmp(b, m, search) < 0)
-                       m = bkey_p_next(m);
-
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
-               struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
-               BUG_ON(prev &&
-                      bkey_iter_cmp_p_or_unp(b, prev,
-                                       packed_search, search) >= 0);
-       }
-
-       return m;
-}
-
-/* Btree node iterator */
-
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
-                             struct btree *b,
-                             const struct bkey_packed *k,
-                             const struct bkey_packed *end)
-{
-       if (k != end) {
-               struct btree_node_iter_set *pos;
-
-               btree_node_iter_for_each(iter, pos)
-                       ;
-
-               BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
-               *pos = (struct btree_node_iter_set) {
-                       __btree_node_key_to_offset(b, k),
-                       __btree_node_key_to_offset(b, end)
-               };
-       }
-}
-
-void bch2_btree_node_iter_push(struct btree_node_iter *iter,
-                              struct btree *b,
-                              const struct bkey_packed *k,
-                              const struct bkey_packed *end)
-{
-       __bch2_btree_node_iter_push(iter, b, k, end);
-       bch2_btree_node_iter_sort(iter, b);
-}
-
-noinline __flatten __cold
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
-                             struct btree *b, struct bpos *search)
-{
-       struct bkey_packed *k;
-
-       trace_bkey_pack_pos_fail(search);
-
-       bch2_btree_node_iter_init_from_start(iter, b);
-
-       while ((k = bch2_btree_node_iter_peek(iter, b)) &&
-              bkey_iter_pos_cmp(b, k, search) < 0)
-               bch2_btree_node_iter_advance(iter, b);
-}
-
-/**
- * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * @iter:      iterator to initialize
- * @b:         btree node to search
- * @search:    search key
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Some adjacent keys with a string of equal keys:
- *     i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search and
- * iterating backwards is very expensive so if the pivot happens to land at the
- * last k that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- *  - For non extents, we guarantee that the live key comes last - see
- *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- *    see will only be deleted keys you don't care about.
- *
- *  - For extents, deleted keys sort last (see the comment at the top of this
- *    file). But when you're searching for extents, you actually want the first
- *    key strictly greater than your search key - an extent that compares equal
- *    to the search key is going to have 0 sectors after the search key.
- *
- *    But this does mean that we can't just search for
- *    bpos_successor(start_of_range) to get the first extent that overlaps with
- *    the range we want - if we're unlucky and there's an extent that ends
- *    exactly where we searched, then there could be a deleted key at the same
- *    position and we'd get that when we search instead of the preceding extent
- *    we needed.
- *
- *    So we've got to search for start_of_range, then after the lookup iterate
- *    past any extents that compare equal to the position we searched for.
- */
-__flatten
-void bch2_btree_node_iter_init(struct btree_node_iter *iter,
-                              struct btree *b, struct bpos *search)
-{
-       struct bkey_packed p, *packed_search = NULL;
-       struct btree_node_iter_set *pos = iter->data;
-       struct bkey_packed *k[MAX_BSETS];
-       unsigned i;
-
-       EBUG_ON(bpos_lt(*search, b->data->min_key));
-       EBUG_ON(bpos_gt(*search, b->data->max_key));
-       bset_aux_tree_verify(b);
-
-       memset(iter, 0, sizeof(*iter));
-
-       switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
-       case BKEY_PACK_POS_EXACT:
-               packed_search = &p;
-               break;
-       case BKEY_PACK_POS_SMALLER:
-               packed_search = NULL;
-               break;
-       case BKEY_PACK_POS_FAIL:
-               btree_node_iter_init_pack_failed(iter, b, search);
-               return;
-       }
-
-       for (i = 0; i < b->nsets; i++) {
-               k[i] = __bch2_bset_search(b, b->set + i, search, &p);
-               prefetch_four_cachelines(k[i]);
-       }
-
-       for (i = 0; i < b->nsets; i++) {
-               struct bset_tree *t = b->set + i;
-               struct bkey_packed *end = btree_bkey_last(b, t);
-
-               k[i] = bch2_bset_search_linear(b, t, search,
-                                              packed_search, &p, k[i]);
-               if (k[i] != end)
-                       *pos++ = (struct btree_node_iter_set) {
-                               __btree_node_key_to_offset(b, k[i]),
-                               __btree_node_key_to_offset(b, end)
-                       };
-       }
-
-       bch2_btree_node_iter_sort(iter, b);
-}
-
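
A toy standalone rendering of the extents caveat from the comment above - after looking up start_of_range you still have to iterate past everything that compares equal to it (plain ints here, nothing bcachefs-specific):

#include <stdio.h>

int main(void)
{
        /* the "i j k k k k l m" picture from the comment, as ints */
        int keys[] = { 1, 2, 3, 3, 3, 3, 4, 5 };
        unsigned n = sizeof(keys) / sizeof(keys[0]);
        int search = 3;
        unsigned i = 0;

        /* a lookup may land on any of the equal keys; step past them all
         * to reach the first key strictly greater than the search key */
        while (i < n && keys[i] <= search)
                i++;

        if (i < n)
                printf("first key > %d: keys[%u] = %d\n", search, i, keys[i]);
        return 0;
}
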
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
-                                         struct btree *b)
-{
-       memset(iter, 0, sizeof(*iter));
-
-       for_each_bset(b, t)
-               __bch2_btree_node_iter_push(iter, b,
-                                          btree_bkey_first(b, t),
-                                          btree_bkey_last(b, t));
-       bch2_btree_node_iter_sort(iter, b);
-}
-
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
-                                                 struct btree *b,
-                                                 struct bset_tree *t)
-{
-       struct btree_node_iter_set *set;
-
-       btree_node_iter_for_each(iter, set)
-               if (set->end == t->end_offset)
-                       return __btree_node_offset_to_key(b, set->k);
-
-       return btree_bkey_last(b, t);
-}
-
-static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
-                                           struct btree *b,
-                                           unsigned first)
-{
-       bool ret;
-
-       if ((ret = (btree_node_iter_cmp(b,
-                                       iter->data[first],
-                                       iter->data[first + 1]) > 0)))
-               swap(iter->data[first], iter->data[first + 1]);
-       return ret;
-}
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
-                              struct btree *b)
-{
-       /* unrolled bubble sort: */
-
-       if (!__btree_node_iter_set_end(iter, 2)) {
-               btree_node_iter_sort_two(iter, b, 0);
-               btree_node_iter_sort_two(iter, b, 1);
-       }
-
-       if (!__btree_node_iter_set_end(iter, 1))
-               btree_node_iter_sort_two(iter, b, 0);
-}
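
(The three compare-and-swaps - pairs (0,1), (1,2), then (0,1) again - are a complete sorting network for the at most three iterator sets, which is why no general-purpose sort is needed.)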
-
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
-                                  struct btree_node_iter_set *set)
-{
-       struct btree_node_iter_set *last =
-               iter->data + ARRAY_SIZE(iter->data) - 1;
-
-       memmove(&set[0], &set[1], (void *) last - (void *) set);
-       *last = (struct btree_node_iter_set) { 0, 0 };
-}
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
-                                                 struct btree *b)
-{
-       iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
-
-       EBUG_ON(iter->data->k > iter->data->end);
-
-       if (unlikely(__btree_node_iter_set_end(iter, 0))) {
-               /* avoid an expensive memmove call: */
-               iter->data[0] = iter->data[1];
-               iter->data[1] = iter->data[2];
-               iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
-               return;
-       }
-
-       if (__btree_node_iter_set_end(iter, 1))
-               return;
-
-       if (!btree_node_iter_sort_two(iter, b, 0))
-               return;
-
-       if (__btree_node_iter_set_end(iter, 2))
-               return;
-
-       btree_node_iter_sort_two(iter, b, 1);
-}
-
-void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
-                                 struct btree *b)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
-               __bch2_btree_node_iter_verify(iter, b);
-               __bch2_btree_node_iter_next_check(iter, b);
-       }
-
-       __bch2_btree_node_iter_advance(iter, b);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
-                                                 struct btree *b)
-{
-       struct bkey_packed *k, *prev = NULL;
-       struct btree_node_iter_set *set;
-       unsigned end = 0;
-
-       bch2_btree_node_iter_verify(iter, b);
-
-       for_each_bset(b, t) {
-               k = bch2_bkey_prev_all(b, t,
-                       bch2_btree_node_iter_bset_pos(iter, b, t));
-               if (k &&
-                   (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
-                       prev = k;
-                       end = t->end_offset;
-               }
-       }
-
-       if (!prev)
-               return NULL;
-
-       /*
-        * We're manually memmoving instead of just calling sort() to ensure the
-        * prev we picked ends up in slot 0 - sort won't necessarily put it
-        * there because of duplicate deleted keys:
-        */
-       btree_node_iter_for_each(iter, set)
-               if (set->end == end)
-                       goto found;
-
-       BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
-found:
-       BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
-
-       memmove(&iter->data[1],
-               &iter->data[0],
-               (void *) set - (void *) &iter->data[0]);
-
-       iter->data[0].k = __btree_node_key_to_offset(b, prev);
-       iter->data[0].end = end;
-
-       bch2_btree_node_iter_verify(iter, b);
-       return prev;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
-                                             struct btree *b)
-{
-       struct bkey_packed *prev;
-
-       do {
-               prev = bch2_btree_node_iter_prev_all(iter, b);
-       } while (prev && bkey_deleted(prev));
-
-       return prev;
-}
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
-                                                struct btree *b,
-                                                struct bkey *u)
-{
-       struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
-
-       return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
-       for_each_bset_c(b, t) {
-               enum bset_aux_tree_type type = bset_aux_tree_type(t);
-               size_t j;
-
-               stats->sets[type].nr++;
-               stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
-                       sizeof(u64);
-
-               if (bset_has_ro_aux_tree(t)) {
-                       stats->floats += t->size - 1;
-
-                       for (j = 1; j < t->size; j++)
-                               stats->failed +=
-                                       bkey_float(b, t, j)->exponent ==
-                                       BFLOAT_FAILED;
-               }
-       }
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
-                        struct bkey_packed *k)
-{
-       struct bset_tree *t = bch2_bkey_to_bset(b, k);
-       struct bkey uk;
-       unsigned j, inorder;
-
-       if (!bset_has_ro_aux_tree(t))
-               return;
-
-       inorder = bkey_to_cacheline(b, t, k);
-       if (!inorder || inorder >= t->size)
-               return;
-
-       j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
-       if (k != tree_to_bkey(b, t, j))
-               return;
-
-       switch (bkey_float(b, t, j)->exponent) {
-       case BFLOAT_FAILED:
-               uk = bkey_unpack_key(b, k);
-               prt_printf(out,
-                      "    failed unpacked at depth %u\n"
-                      "\t",
-                      ilog2(j));
-               bch2_bpos_to_text(out, uk.p);
-               prt_printf(out, "\n");
-               break;
-       }
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644 (file)
index a15ecf9..0000000
+++ /dev/null
@@ -1,536 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid() and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * MAX_BSETS (3) in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So we know the key we're looking for is between a and b, and a and b don't
- * differ higher than bit 50, we don't need to check anything higher than bit
- * 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking.  Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n.  The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 84 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they don't we
- * flag that node, and when doing lookups we fall back to comparing against the
- * real key. As long as this doesn't happen too often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * Here the exponent and the key's offset within its cacheline get a byte
- * each, and the mantissa gets 16 bits (BKEY_MANTISSA_BITS), so a node
- * (struct bkey_float) is 4 bytes. We need one node per BSET_CACHELINE (256)
- * bytes in the btree node, which means the auxiliary search trees take up
- * about 1.6% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the ith range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
-
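
A standalone sketch of the array-embedded binary tree descent described above, over plain ints; bset_search_tree() in bset.c walks the same 1-indexed layout, just with bkey_floats, prefetching, and a fallback for failed nodes:

#include <stdio.h>

/* 1-indexed eytzinger layout: node n's children live at 2n and 2n + 1, so
 * each level of a subtree is contiguous in memory - easy to prefetch */
static unsigned eytz_descend(const int *tree, unsigned size, int search)
{
        unsigned n = 1;

        while (n < size)
                n = n * 2 + (tree[n] < search);
        return n;
}

int main(void)
{
        /* the tree     4
         *            /   \
         *           2     6
         *          / \   / \
         *         1   3 5   7   stored breadth-first in tree[1..7] */
        int tree[8] = { 0, 4, 2, 6, 1, 3, 5, 7 };
        unsigned n = eytz_descend(tree, 8, 5);

        /* n encodes where we fell off the tree; its low bit records the
         * final branch direction, which bset_search_tree() relies on */
        printf("descent ended at n = %u\n", n);         /* n = 12 */
        return 0;
}
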
-enum bset_aux_tree_type {
-       BSET_NO_AUX_TREE,
-       BSET_RO_AUX_TREE,
-       BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES     3
-
-#define BSET_NO_AUX_TREE_VAL   (U16_MAX)
-#define BSET_RW_AUX_TREE_VAL   (U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
-       switch (t->extra) {
-       case BSET_NO_AUX_TREE_VAL:
-               EBUG_ON(t->size);
-               return BSET_NO_AUX_TREE;
-       case BSET_RW_AUX_TREE_VAL:
-               EBUG_ON(!t->size);
-               return BSET_RW_AUX_TREE;
-       default:
-               EBUG_ON(!t->size);
-               return BSET_RO_AUX_TREE;
-       }
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was larger; it's 256 now.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bkey_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE         256
-
-static inline size_t btree_keys_cachelines(const struct btree *b)
-{
-       return (1U << b->byte_order) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(const struct btree *b)
-{
-       return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(const struct btree *b)
-{
-       return btree_aux_data_bytes(b) / sizeof(u64);
-}
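-
-/*
- * Worked example (illustrative, assuming a 256KB btree node, i.e.
- * b->byte_order == 18):
- *
- *	btree_keys_cachelines(b) == (1U << 18) / 256 == 1024
- *	btree_aux_data_bytes(b)  == 1024 * 8         == 8192
- *	btree_aux_data_u64s(b)   == 8192 / 8         == 1024
- */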
-
-#define for_each_bset(_b, _t)                                          \
-       for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define for_each_bset_c(_b, _t)                                                \
-       for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define bset_tree_for_each_key(_b, _t, _k)                             \
-       for (_k = btree_bkey_first(_b, _t);                             \
-            _k != btree_bkey_last(_b, _t);                             \
-            _k = bkey_p_next(_k))
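-
-/*
- * Illustrative use of the iteration macros above (a sketch, not from the
- * original header) - counting every key in every bset of a node:
- *
- *	struct bkey_packed *k;
- *	unsigned nr = 0;
- *
- *	for_each_bset(b, t)
- *		bset_tree_for_each_key(b, t, k)
- *			nr++;
- */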
-
-static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
-{
-       return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
-       return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch2_bset_set_no_aux_tree(struct btree *b,
-                                           struct bset_tree *t)
-{
-       BUG_ON(t < b->set);
-
-       for (; t < b->set + ARRAY_SIZE(b->set); t++) {
-               t->size = 0;
-               t->extra = BSET_NO_AUX_TREE_VAL;
-               t->aux_data_offset = U16_MAX;
-       }
-}
-
-static inline void btree_node_set_format(struct btree *b,
-                                        struct bkey_format f)
-{
-       int len;
-
-       b->format       = f;
-       b->nr_key_bits  = bkey_format_key_bits(&f);
-
-       len = bch2_compile_bkey_format(&b->format, b->aux_data);
-       BUG_ON(len < 0 || len > U8_MAX);
-
-       b->unpack_fn_len = len;
-
-       bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
-                                        unsigned block_bytes)
-{
-       struct bset *i = btree_bset_last(b);
-
-       EBUG_ON(!is_power_of_2(block_bytes));
-
-       return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch2_btree_keys_init(struct btree *);
-
-void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
-void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-
-void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
-                     unsigned);
-void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
-                                   const struct bkey_packed *l,
-                                   const struct bkey_packed *r_packed,
-                                   const struct bpos *r)
-{
-       EBUG_ON(r_packed && !bkey_packed(r_packed));
-
-       if (unlikely(!bkey_packed(l)))
-               return bpos_cmp(packed_to_bkey_c(l)->p, *r);
-
-       if (likely(r_packed))
-               return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
-
-       return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-static inline struct bset_tree *
-bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
-{
-       unsigned offset = __btree_node_key_to_offset(b, k);
-
-       for_each_bset(b, t)
-               if (offset <= t->end_offset) {
-                       EBUG_ON(offset < btree_bkey_first_offset(t));
-                       return t;
-               }
-
-       BUG();
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
-                                         struct bkey_packed *, unsigned);
-
-static inline struct bkey_packed *
-bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
-       return bch2_bkey_prev_filter(b, t, k, 0);
-}
-
-static inline struct bkey_packed *
-bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
-       return bch2_bkey_prev_filter(b, t, k, 1);
-}
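-
-/*
- * Per its callers, the trailing argument to bch2_bkey_prev_filter() is a
- * minimum key type: bch2_bkey_prev_all() passes 0 and returns every key,
- * while bch2_bkey_prev() passes 1, skipping KEY_TYPE_deleted whiteouts
- * (assuming KEY_TYPE_deleted == 0, as in bcachefs_format.h).
- */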
-
-/* Btree key iteration */
-
-void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
-                             const struct bkey_packed *,
-                             const struct bkey_packed *);
-void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
-                              struct bpos *);
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
-                                         struct btree *);
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
-                                                struct btree *,
-                                                struct bset_tree *);
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
-                                  struct btree_node_iter_set *);
-void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set)                          \
-       for (_set = (_iter)->data;                                      \
-            _set < (_iter)->data + ARRAY_SIZE((_iter)->data) &&        \
-            (_set)->k != (_set)->end;                                  \
-            _set++)
-
-static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
-                                            unsigned i)
-{
-       return iter->data[i].k == iter->data[i].end;
-}
-
-static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
-{
-       return __btree_node_iter_set_end(iter, 0);
-}
-
-/*
- * When keys compare equal, deleted keys compare first:
- *
- * XXX: only need to compare pointers for keys that are both within a
- * btree_node_iterator - we need to break ties for prev() to work correctly
- */
-static inline int bkey_iter_cmp(const struct btree *b,
-                               const struct bkey_packed *l,
-                               const struct bkey_packed *r)
-{
-       return bch2_bkey_cmp_packed(b, l, r)
-               ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
-               ?: cmp_int(l, r);
-}
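-
-/*
- * Example (illustrative): if l and r are at the same position and only l is
- * deleted, bch2_bkey_cmp_packed() returns 0 and the second term evaluates to
- * 0 - 1 == -1, so the deleted key sorts first; cmp_int() on the pointers
- * breaks any remaining tie, which is what makes prev() well defined.
- */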
-
-static inline int btree_node_iter_cmp(const struct btree *b,
-                                     struct btree_node_iter_set l,
-                                     struct btree_node_iter_set r)
-{
-       return bkey_iter_cmp(b,
-                       __btree_node_offset_to_key(b, l.k),
-                       __btree_node_offset_to_key(b, r.k));
-}
-
-/* These assume r (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(const struct btree *b,
-                       const struct bkey_packed *l,
-                       const struct bpos *r)
-{
-       return bkey_cmp_left_packed(b, l, r)
-               ?: -((int) bkey_deleted(l));
-}
-
-static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
-                                   const struct bkey_packed *l,
-                                   const struct bkey_packed *r_packed,
-                                   const struct bpos *r)
-{
-       return bkey_cmp_p_or_unp(b, l, r_packed, r)
-               ?: -((int) bkey_deleted(l));
-}
-
-static inline struct bkey_packed *
-__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
-                               struct btree *b)
-{
-       return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
-{
-       return !bch2_btree_node_iter_end(iter)
-               ? __btree_node_offset_to_key(b, iter->data->k)
-               : NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
-       struct bkey_packed *k;
-
-       while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
-              bkey_deleted(k))
-               bch2_btree_node_iter_advance(iter, b);
-
-       return k;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
-       struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
-
-       if (ret)
-               bch2_btree_node_iter_advance(iter, b);
-
-       return ret;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
-                                                 struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
-                                             struct btree *);
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
-                                               struct btree *,
-                                               struct bkey *);
-
-#define for_each_btree_node_key(b, k, iter)                            \
-       for (bch2_btree_node_iter_init_from_start((iter), (b));         \
-            (k = bch2_btree_node_iter_peek((iter), (b)));              \
-            bch2_btree_node_iter_advance(iter, b))
-
-#define for_each_btree_node_key_unpack(b, k, iter, unpacked)           \
-       for (bch2_btree_node_iter_init_from_start((iter), (b));         \
-            (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
-            bch2_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
-                                         unsigned bset,
-                                         struct bkey_packed *k,
-                                         int sign)
-{
-       n->live_u64s            += k->u64s * sign;
-       n->bset_u64s[bset]      += k->u64s * sign;
-
-       if (bkey_packed(k))
-               n->packed_keys  += sign;
-       else
-               n->unpacked_keys += sign;
-}
-
-static inline void btree_keys_account_val_delta(struct btree *b,
-                                               struct bkey_packed *k,
-                                               int delta)
-{
-       struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
-       b->nr.live_u64s                 += delta;
-       b->nr.bset_u64s[t - b->set]     += delta;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k)         \
-       btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k)        \
-       btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-#define btree_account_key_add(_b, _k)                          \
-       btree_keys_account_key(&(_b)->nr,                       \
-               bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
-#define btree_account_key_drop(_b, _k)                         \
-       btree_keys_account_key(&(_b)->nr,                       \
-               bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
-
-struct bset_stats {
-       struct {
-               size_t nr, bytes;
-       } sets[BSET_TREE_NR_TYPES];
-
-       size_t floats;
-       size_t failed;
-};
-
-void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
-void bch2_bfloat_to_text(struct printbuf *, struct btree *,
-                        struct bkey_packed *);
-
-/* Debug stuff */
-
-void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct bch_fs *, struct btree *);
-void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-void __bch2_verify_btree_nr_keys(struct btree *);
-void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-
-static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-                                              struct btree *b)
-{
-       if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
-               __bch2_btree_node_iter_verify(iter, b);
-}
-
-static inline void bch2_verify_btree_nr_keys(struct btree *b)
-{
-       if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
-               __bch2_verify_btree_nr_keys(b);
-}
-
-#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
deleted file mode 100644 (file)
index 83c9860..0000000
+++ /dev/null
@@ -1,1516 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sched/mm.h>
-#include <linux/swap.h>
-
-const char * const bch2_btree_node_flags[] = {
-       "typebit",
-       "typebit",
-       "typebit",
-#define x(f)   [BTREE_NODE_##f] = #f,
-       BTREE_FLAGS()
-#undef x
-       NULL
-};
-
-void bch2_recalc_btree_reserve(struct bch_fs *c)
-{
-       unsigned reserve = 16;
-
-       if (!c->btree_roots_known[0].b)
-               reserve += 8;
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
-               struct btree_root *r = bch2_btree_id_root(c, i);
-
-               if (r->b)
-                       reserve += min_t(unsigned, 1, r->b->c.level) * 8;
-       }
-
-       c->btree_cache.nr_reserve = reserve;
-}
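-
-/*
- * Illustrative arithmetic: with the first btree root present and ten live
- * trees whose roots are interior nodes (level >= 1), the reserve comes to
- * 16 + 10 * 8 == 96 nodes; leaf-level roots add nothing, since
- * min_t(unsigned, 1, 0) == 0.
- */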
-
-static inline size_t btree_cache_can_free(struct btree_cache_list *list)
-{
-       struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
-       size_t can_free = list->nr;
-       if (!list->idx)
-               can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
-       return can_free;
-}
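-
-/*
- * E.g. (illustrative): with nr_reserve == 16 and only 10 nodes on live[0],
- * can_free clamps to 0 - the reserve is never reported to the shrinker.
- * The pinned list (idx 1) is not clamped.
- */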
-
-static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
-{
-       BUG_ON(!list_empty(&b->list));
-
-       if (b->c.lock.readers)
-               list_add(&b->list, &bc->freed_pcpu);
-       else
-               list_add(&b->list, &bc->freed_nonpcpu);
-}
-
-static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
-{
-       BUG_ON(!list_empty(&b->list));
-       BUG_ON(!b->data);
-
-       bc->nr_freeable++;
-       list_add(&b->list, &bc->freeable);
-}
-
-void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
-{
-       struct btree_cache *bc = &c->btree_cache;
-
-       mutex_lock(&bc->lock);
-       __bch2_btree_node_to_freelist(bc, b);
-       mutex_unlock(&bc->lock);
-
-       six_unlock_write(&b->c.lock);
-       six_unlock_intent(&b->c.lock);
-}
-
-void __btree_node_data_free(struct btree *b)
-{
-       BUG_ON(!list_empty(&b->list));
-       BUG_ON(btree_node_hashed(b));
-
-       /*
-        * This should really be done in slub/vmalloc, but we're using the
-        * kmalloc_large() path, so we're working around a slub bug by doing
-        * this here:
-        */
-       if (b->data)
-               mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
-       if (b->aux_data)
-               mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
-
-       EBUG_ON(btree_node_write_in_flight(b));
-
-       clear_btree_node_just_written(b);
-
-       kvfree(b->data);
-       b->data = NULL;
-#ifdef __KERNEL__
-       kvfree(b->aux_data);
-#else
-       munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
-       b->aux_data = NULL;
-}
-
-static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
-{
-       BUG_ON(list_empty(&b->list));
-       list_del_init(&b->list);
-
-       __btree_node_data_free(b);
-
-       --bc->nr_freeable;
-       btree_node_to_freedlist(bc, b);
-}
-
-static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
-                                  const void *obj)
-{
-       const struct btree *b = obj;
-       const u64 *v = arg->key;
-
-       return b->hash_val == *v ? 0 : 1;
-}
-
-static const struct rhashtable_params bch_btree_cache_params = {
-       .head_offset            = offsetof(struct btree, hash),
-       .key_offset             = offsetof(struct btree, hash_val),
-       .key_len                = sizeof(u64),
-       .obj_cmpfn              = bch2_btree_cache_cmp_fn,
-       .automatic_shrinking    = true,
-};
-
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
-       BUG_ON(b->data || b->aux_data);
-
-       gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
-       b->data = kvmalloc(btree_buf_bytes(b), gfp);
-       if (!b->data)
-               return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
-#ifdef __KERNEL__
-       b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
-#else
-       b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
-                          PROT_READ|PROT_WRITE|PROT_EXEC,
-                          MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
-       if (b->aux_data == MAP_FAILED)
-               b->aux_data = NULL;
-#endif
-       if (!b->aux_data) {
-               kvfree(b->data);
-               b->data = NULL;
-               return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
-       }
-
-       return 0;
-}
-
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
-{
-       struct btree *b;
-
-       b = kzalloc(sizeof(struct btree), gfp);
-       if (!b)
-               return NULL;
-
-       bkey_btree_ptr_init(&b->key);
-       INIT_LIST_HEAD(&b->list);
-       INIT_LIST_HEAD(&b->write_blocked);
-       b->byte_order = ilog2(c->opts.btree_node_size);
-       return b;
-}
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
-{
-       struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
-       if (!b)
-               return NULL;
-
-       if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
-               kfree(b);
-               return NULL;
-       }
-
-       bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-       return b;
-}
-
-static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
-{
-       struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
-       u64 mask = bc->pinned_nodes_mask[!!b->c.level];
-
-       return ((mask & BIT_ULL(b->c.btree_id)) &&
-               bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-               bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
-}
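-
-/*
- * Sketch of the convention implied above: pinned_nodes_mask[0] covers leaf
- * nodes and pinned_nodes_mask[1] interior nodes, one bit per btree ID - e.g.
- * setting BIT_ULL(BTREE_ID_extents) in pinned_nodes_mask[1] pins interior
- * extents-tree nodes whose position falls in (pinned_nodes_start,
- * pinned_nodes_end].
- */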
-
-void bch2_node_pin(struct bch_fs *c, struct btree *b)
-{
-       struct btree_cache *bc = &c->btree_cache;
-
-       mutex_lock(&bc->lock);
-       if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
-               set_btree_node_pinned(b);
-               list_move(&b->list, &bc->live[1].list);
-               bc->live[0].nr--;
-               bc->live[1].nr++;
-       }
-       mutex_unlock(&bc->lock);
-}
-
-void bch2_btree_cache_unpin(struct bch_fs *c)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b, *n;
-
-       mutex_lock(&bc->lock);
-       c->btree_cache.pinned_nodes_mask[0] = 0;
-       c->btree_cache.pinned_nodes_mask[1] = 0;
-
-       list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
-               clear_btree_node_pinned(b);
-               list_move(&b->list, &bc->live[0].list);
-               bc->live[0].nr++;
-               bc->live[1].nr--;
-       }
-
-       mutex_unlock(&bc->lock);
-}
-
-/* Btree in memory cache - hash table */
-
-void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
-       lockdep_assert_held(&bc->lock);
-
-       int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
-       BUG_ON(ret);
-
-       /* Cause future lookups for this node to fail: */
-       b->hash_val = 0;
-
-       if (b->c.btree_id < BTREE_ID_NR)
-               --bc->nr_by_btree[b->c.btree_id];
-       --bc->live[btree_node_pinned(b)].nr;
-       list_del_init(&b->list);
-}
-
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
-       __bch2_btree_node_hash_remove(bc, b);
-       __bch2_btree_node_to_freelist(bc, b);
-}
-
-int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
-{
-       BUG_ON(!list_empty(&b->list));
-       BUG_ON(b->hash_val);
-
-       b->hash_val = btree_ptr_hash_val(&b->key);
-       int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
-                                               bch_btree_cache_params);
-       if (ret)
-               return ret;
-
-       if (b->c.btree_id < BTREE_ID_NR)
-               bc->nr_by_btree[b->c.btree_id]++;
-
-       bool p = __btree_node_pinned(bc, b);
-       mod_bit(BTREE_NODE_pinned, &b->flags, p);
-
-       list_add_tail(&b->list, &bc->live[p].list);
-       bc->live[p].nr++;
-       return 0;
-}
-
-int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
-                               unsigned level, enum btree_id id)
-{
-       b->c.level      = level;
-       b->c.btree_id   = id;
-
-       mutex_lock(&bc->lock);
-       int ret = __bch2_btree_node_hash_insert(bc, b);
-       mutex_unlock(&bc->lock);
-
-       return ret;
-}
-
-void bch2_btree_node_update_key_early(struct btree_trans *trans,
-                                     enum btree_id btree, unsigned level,
-                                     struct bkey_s_c old, struct bkey_i *new)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b;
-       struct bkey_buf tmp;
-       int ret;
-
-       bch2_bkey_buf_init(&tmp);
-       bch2_bkey_buf_reassemble(&tmp, c, old);
-
-       b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
-       if (!IS_ERR_OR_NULL(b)) {
-               mutex_lock(&c->btree_cache.lock);
-
-               __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-               bkey_copy(&b->key, new);
-               ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-               BUG_ON(ret);
-
-               mutex_unlock(&c->btree_cache.lock);
-               six_unlock_read(&b->c.lock);
-       }
-
-       bch2_bkey_buf_exit(&tmp, c);
-}
-
-__flatten
-static inline struct btree *btree_cache_find(struct btree_cache *bc,
-                                    const struct bkey_i *k)
-{
-       u64 v = btree_ptr_hash_val(k);
-
-       return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
-}
-
-static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
-                                      bool flush, bool locked)
-{
-       struct btree_cache *bc = &c->btree_cache;
-
-       lockdep_assert_held(&bc->lock);
-
-       if (btree_node_noevict(b)) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
-               return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-       }
-       if (btree_node_write_blocked(b)) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
-               return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-       }
-       if (btree_node_will_make_reachable(b)) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
-               return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-       }
-
-       if (btree_node_dirty(b)) {
-               if (!flush) {
-                       bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
-                       return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-               }
-
-               if (locked) {
-                       /*
-                        * Using the underscore version because we don't want to compact
-                        * bsets after the write, since this node is about to be evicted
-                        * - unless btree verify mode is enabled, since it runs
-                        * from the post-write cleanup:
-                        */
-                       if (static_branch_unlikely(&bch2_verify_btree_ondisk))
-                               bch2_btree_node_write(c, b, SIX_LOCK_intent,
-                                                     BTREE_WRITE_cache_reclaim);
-                       else
-                               __bch2_btree_node_write(c, b,
-                                                       BTREE_WRITE_cache_reclaim);
-               }
-       }
-
-       if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
-                       (1U << BTREE_NODE_write_in_flight))) {
-               if (!flush) {
-                       if (btree_node_read_in_flight(b))
-                               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
-                       else if (btree_node_write_in_flight(b))
-                               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
-                       return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-               }
-
-               if (locked)
-                       return -EINTR;
-
-               /* XXX: waiting on IO with btree cache lock held */
-               bch2_btree_node_wait_on_read(b);
-               bch2_btree_node_wait_on_write(b);
-       }
-
-       return 0;
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       int ret = 0;
-
-       lockdep_assert_held(&bc->lock);
-retry_unlocked:
-       ret = __btree_node_reclaim_checks(c, b, flush, false);
-       if (ret)
-               return ret;
-
-       if (!six_trylock_intent(&b->c.lock)) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
-               return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-       }
-
-       if (!six_trylock_write(&b->c.lock)) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
-               six_unlock_intent(&b->c.lock);
-               return bch_err_throw(c, ENOMEM_btree_node_reclaim);
-       }
-
-       /* recheck under lock */
-       ret = __btree_node_reclaim_checks(c, b, flush, true);
-       if (ret) {
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-               if (ret == -EINTR)
-                       goto retry_unlocked;
-               return ret;
-       }
-
-       if (b->hash_val && !ret)
-               trace_and_count(c, btree_cache_reap, c, b);
-       return 0;
-}
-
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
-{
-       return __btree_node_reclaim(c, b, false);
-}
-
-static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
-{
-       return __btree_node_reclaim(c, b, true);
-}
-
-static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
-                                          struct shrink_control *sc)
-{
-       struct btree_cache_list *list = shrink->private_data;
-       struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-       struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-       struct btree *b, *t;
-       unsigned long nr = sc->nr_to_scan;
-       unsigned long can_free = 0;
-       unsigned long freed = 0;
-       unsigned long touched = 0;
-       unsigned i, flags;
-       unsigned long ret = SHRINK_STOP;
-       bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
-
-       if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
-               return SHRINK_STOP;
-
-       mutex_lock(&bc->lock);
-       flags = memalloc_nofs_save();
-
-       /*
-        * It's _really_ critical that we don't free too many btree nodes - we
-        * have to always leave ourselves a reserve. The reserve is how we
-        * guarantee that allocating memory for a new btree node can always
-        * succeed, so that inserting keys into the btree can always succeed and
-        * IO can always make forward progress:
-        */
-       can_free = btree_cache_can_free(list);
-       if (nr > can_free) {
-               bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
-               nr = can_free;
-       }
-
-       i = 0;
-       list_for_each_entry_safe(b, t, &bc->freeable, list) {
-               /*
-                * Leave a few nodes on the freeable list, so that a btree split
-                * won't have to hit the system allocator:
-                */
-               if (++i <= 3)
-                       continue;
-
-               touched++;
-
-               if (touched >= nr)
-                       goto out;
-
-               if (!btree_node_reclaim(c, b)) {
-                       btree_node_data_free(bc, b);
-                       six_unlock_write(&b->c.lock);
-                       six_unlock_intent(&b->c.lock);
-                       freed++;
-                       bc->nr_freed++;
-               }
-       }
-restart:
-       list_for_each_entry_safe(b, t, &list->list, list) {
-               touched++;
-
-               if (btree_node_accessed(b)) {
-                       clear_btree_node_accessed(b);
-                       bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
-                       --touched;
-               } else if (!btree_node_reclaim(c, b)) {
-                       __bch2_btree_node_hash_remove(bc, b);
-                       __btree_node_data_free(b);
-                       btree_node_to_freedlist(bc, b);
-
-                       freed++;
-                       bc->nr_freed++;
-
-                       six_unlock_write(&b->c.lock);
-                       six_unlock_intent(&b->c.lock);
-
-                       if (freed == nr)
-                               goto out_rotate;
-               } else if (trigger_writes &&
-                          btree_node_dirty(b) &&
-                          !btree_node_will_make_reachable(b) &&
-                          !btree_node_write_blocked(b) &&
-                          six_trylock_read(&b->c.lock)) {
-                       list_move(&list->list, &b->list);
-                       mutex_unlock(&bc->lock);
-                       __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
-                       six_unlock_read(&b->c.lock);
-                       if (touched >= nr)
-                               goto out_nounlock;
-                       mutex_lock(&bc->lock);
-                       goto restart;
-               }
-
-               if (touched >= nr)
-                       break;
-       }
-out_rotate:
-       if (&t->list != &list->list)
-               list_move_tail(&list->list, &t->list);
-out:
-       mutex_unlock(&bc->lock);
-out_nounlock:
-       ret = freed;
-       memalloc_nofs_restore(flags);
-       trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
-       return ret;
-}
-
-static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
-                                           struct shrink_control *sc)
-{
-       struct btree_cache_list *list = shrink->private_data;
-
-       if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
-               return 0;
-
-       return btree_cache_can_free(list);
-}
-
-void bch2_fs_btree_cache_exit(struct bch_fs *c)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b, *t;
-       unsigned long flags;
-
-       shrinker_free(bc->live[1].shrink);
-       shrinker_free(bc->live[0].shrink);
-
-       /* vfree() can allocate memory: */
-       flags = memalloc_nofs_save();
-       mutex_lock(&bc->lock);
-
-       if (c->verify_data)
-               list_move(&c->verify_data->list, &bc->live[0].list);
-
-       kvfree(c->verify_ondisk);
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
-               struct btree_root *r = bch2_btree_id_root(c, i);
-
-               if (r->b)
-                       list_add(&r->b->list, &bc->live[0].list);
-       }
-
-       list_for_each_entry_safe(b, t, &bc->live[1].list, list)
-               bch2_btree_node_hash_remove(bc, b);
-       list_for_each_entry_safe(b, t, &bc->live[0].list, list)
-               bch2_btree_node_hash_remove(bc, b);
-
-       list_for_each_entry_safe(b, t, &bc->freeable, list) {
-               BUG_ON(btree_node_read_in_flight(b) ||
-                      btree_node_write_in_flight(b));
-
-               btree_node_data_free(bc, b);
-               cond_resched();
-       }
-
-       BUG_ON(!bch2_journal_error(&c->journal) &&
-              atomic_long_read(&c->btree_cache.nr_dirty));
-
-       list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
-
-       list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
-               list_del(&b->list);
-               six_lock_exit(&b->c.lock);
-               kfree(b);
-       }
-
-       mutex_unlock(&bc->lock);
-       memalloc_nofs_restore(flags);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
-               BUG_ON(bc->nr_by_btree[i]);
-       BUG_ON(bc->live[0].nr);
-       BUG_ON(bc->live[1].nr);
-       BUG_ON(bc->nr_freeable);
-
-       if (bc->table_init_done)
-               rhashtable_destroy(&bc->table);
-}
-
-int bch2_fs_btree_cache_init(struct bch_fs *c)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       struct shrinker *shrink;
-       unsigned i;
-       int ret = 0;
-
-       ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
-       if (ret)
-               goto err;
-
-       bc->table_init_done = true;
-
-       bch2_recalc_btree_reserve(c);
-
-       for (i = 0; i < bc->nr_reserve; i++) {
-               struct btree *b = __bch2_btree_node_mem_alloc(c);
-               if (!b)
-                       goto err;
-               __bch2_btree_node_to_freelist(bc, b);
-       }
-
-       list_splice_init(&bc->live[0].list, &bc->freeable);
-
-       mutex_init(&c->verify_lock);
-
-       shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
-       if (!shrink)
-               goto err;
-       bc->live[0].shrink      = shrink;
-       shrink->count_objects   = bch2_btree_cache_count;
-       shrink->scan_objects    = bch2_btree_cache_scan;
-       shrink->seeks           = 2;
-       shrink->private_data    = &bc->live[0];
-       shrinker_register(shrink);
-
-       shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
-       if (!shrink)
-               goto err;
-       bc->live[1].shrink      = shrink;
-       shrink->count_objects   = bch2_btree_cache_count;
-       shrink->scan_objects    = bch2_btree_cache_scan;
-       shrink->seeks           = 8;
-       shrink->private_data    = &bc->live[1];
-       shrinker_register(shrink);
-
-       return 0;
-err:
-       return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-}
-
-void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
-{
-       mutex_init(&bc->lock);
-       for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
-               bc->live[i].idx = i;
-               INIT_LIST_HEAD(&bc->live[i].list);
-       }
-       INIT_LIST_HEAD(&bc->freeable);
-       INIT_LIST_HEAD(&bc->freed_pcpu);
-       INIT_LIST_HEAD(&bc->freed_nonpcpu);
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, taken by
- * bch2_btree_cache_cannibalize_lock(). This means every time we unlock the root of
- * the btree, we need to release this lock if we have it held.
- */
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-
-       if (bc->alloc_lock == current) {
-               trace_and_count(c, btree_cache_cannibalize_unlock, trans);
-               bc->alloc_lock = NULL;
-               closure_wake_up(&bc->alloc_wait);
-       }
-}
-
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct task_struct *old;
-
-       old = NULL;
-       if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
-               goto success;
-
-       if (!cl) {
-               trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-               return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
-       }
-
-       closure_wait(&bc->alloc_wait, cl);
-
-       /* Try again, after adding ourselves to waitlist */
-       old = NULL;
-       if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
-               /* We raced */
-               closure_wake_up(&bc->alloc_wait);
-               goto success;
-       }
-
-       trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-       return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
-
-success:
-       trace_and_count(c, btree_cache_cannibalize_lock, trans);
-       return 0;
-}
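-
-/*
- * Typical usage (a sketch): a caller that may need to cannibalize passes a
- * closure, retries after the closure is woken if it gets
- * -BCH_ERR_btree_cache_cannibalize_lock_blocked, and releases the lock with
- * bch2_btree_cache_cannibalize_unlock() when done; bch2_btree_node_mem_alloc()
- * below is one consumer, checking bc->alloc_lock == current in its error path.
- */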
-
-static struct btree *btree_node_cannibalize(struct bch_fs *c)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
-               list_for_each_entry_reverse(b, &bc->live[i].list, list)
-                       if (!btree_node_reclaim(c, b))
-                               return b;
-
-       while (1) {
-               for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
-                       list_for_each_entry_reverse(b, &bc->live[i].list, list)
-                               if (!btree_node_write_and_reclaim(c, b))
-                                       return b;
-
-               /*
-                * Rare case: all nodes were intent-locked.
-                * Just busy-wait.
-                */
-               WARN_ONCE(1, "btree cache cannibalize failed\n");
-               cond_resched();
-       }
-}
-
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct list_head *freed = pcpu_read_locks
-               ? &bc->freed_pcpu
-               : &bc->freed_nonpcpu;
-       struct btree *b, *b2;
-       u64 start_time = local_clock();
-
-       mutex_lock(&bc->lock);
-
-       /*
-        * We never free struct btree itself, just the memory that holds the on
-        * disk node. Check the freed list before allocating a new one:
-        */
-       list_for_each_entry(b, freed, list)
-               if (!btree_node_reclaim(c, b)) {
-                       list_del_init(&b->list);
-                       goto got_node;
-               }
-
-       b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
-       if (b) {
-               bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
-       } else {
-               mutex_unlock(&bc->lock);
-               bch2_trans_unlock(trans);
-               b = __btree_node_mem_alloc(c, GFP_KERNEL);
-               if (!b)
-                       goto err;
-               bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
-               mutex_lock(&bc->lock);
-       }
-
-       BUG_ON(!six_trylock_intent(&b->c.lock));
-       BUG_ON(!six_trylock_write(&b->c.lock));
-
-got_node:
-       /*
-        * btree_free() doesn't free memory; it sticks the node on the end of
-        * the list. Check if there's any freed nodes there:
-        */
-       list_for_each_entry(b2, &bc->freeable, list)
-               if (!btree_node_reclaim(c, b2)) {
-                       swap(b->data, b2->data);
-                       swap(b->aux_data, b2->aux_data);
-
-                       list_del_init(&b2->list);
-                       --bc->nr_freeable;
-                       btree_node_to_freedlist(bc, b2);
-                       mutex_unlock(&bc->lock);
-
-                       six_unlock_write(&b2->c.lock);
-                       six_unlock_intent(&b2->c.lock);
-                       goto got_mem;
-               }
-
-       mutex_unlock(&bc->lock);
-
-       if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
-               bch2_trans_unlock(trans);
-               if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
-                       goto err;
-       }
-
-got_mem:
-       BUG_ON(!list_empty(&b->list));
-       BUG_ON(btree_node_hashed(b));
-       BUG_ON(btree_node_dirty(b));
-       BUG_ON(btree_node_write_in_flight(b));
-out:
-       b->flags                = 0;
-       b->written              = 0;
-       b->nsets                = 0;
-       b->sib_u64s[0]          = 0;
-       b->sib_u64s[1]          = 0;
-       b->whiteout_u64s        = 0;
-       bch2_btree_keys_init(b);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
-                              start_time);
-
-       int ret = bch2_trans_relock(trans);
-       if (unlikely(ret)) {
-               bch2_btree_node_to_freelist(c, b);
-               return ERR_PTR(ret);
-       }
-
-       return b;
-err:
-       mutex_lock(&bc->lock);
-
-       /* Try to cannibalize another cached btree node: */
-       if (bc->alloc_lock == current) {
-               b2 = btree_node_cannibalize(c);
-               clear_btree_node_just_written(b2);
-               __bch2_btree_node_hash_remove(bc, b2);
-
-               if (b) {
-                       swap(b->data, b2->data);
-                       swap(b->aux_data, b2->aux_data);
-                       btree_node_to_freedlist(bc, b2);
-                       six_unlock_write(&b2->c.lock);
-                       six_unlock_intent(&b2->c.lock);
-               } else {
-                       b = b2;
-               }
-
-               BUG_ON(!list_empty(&b->list));
-               mutex_unlock(&bc->lock);
-
-               trace_and_count(c, btree_cache_cannibalize, trans);
-               goto out;
-       }
-
-       mutex_unlock(&bc->lock);
-       return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
-                               struct btree_path *path,
-                               const struct bkey_i *k,
-                               enum btree_id btree_id,
-                               unsigned level,
-                               enum six_lock_type lock_type,
-                               bool sync)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
-
-       if (unlikely(level >= BTREE_MAX_DEPTH)) {
-               int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
-                                                level, BTREE_MAX_DEPTH);
-               return ERR_PTR(ret);
-       }
-
-       if (unlikely(!bkey_is_btree_ptr(&k->k))) {
-               struct printbuf buf = PRINTBUF;
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
-               int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
-               printbuf_exit(&buf);
-               return ERR_PTR(ret);
-       }
-
-       if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
-               struct printbuf buf = PRINTBUF;
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
-               int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
-               printbuf_exit(&buf);
-               return ERR_PTR(ret);
-       }
-
-       /*
-        * Parent node must be locked, else we could read in a btree node that's
-        * been freed:
-        */
-       if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
-               trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
-               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
-       }
-
-       b = bch2_btree_node_mem_alloc(trans, level != 0);
-
-       if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
-               if (!path)
-                       return b;
-
-               trans->memory_allocation_failure = true;
-               trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
-               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
-       }
-
-       if (IS_ERR(b))
-               return b;
-
-       bkey_copy(&b->key, k);
-       if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
-               /* raced with another fill: */
-
-               /* mark as unhashed... */
-               b->hash_val = 0;
-
-               mutex_lock(&bc->lock);
-               __bch2_btree_node_to_freelist(bc, b);
-               mutex_unlock(&bc->lock);
-
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-               return NULL;
-       }
-
-       set_btree_node_read_in_flight(b);
-       six_unlock_write(&b->c.lock);
-
-       if (path) {
-               u32 seq = six_lock_seq(&b->c.lock);
-
-               /* Unlock before doing IO: */
-               six_unlock_intent(&b->c.lock);
-               bch2_trans_unlock(trans);
-
-               bch2_btree_node_read(trans, b, sync);
-
-               int ret = bch2_trans_relock(trans);
-               if (ret)
-                       return ERR_PTR(ret);
-
-               if (!sync)
-                       return NULL;
-
-               if (!six_relock_type(&b->c.lock, lock_type, seq))
-                       b = NULL;
-       } else {
-               bch2_btree_node_read(trans, b, sync);
-               if (lock_type == SIX_LOCK_read)
-                       six_lock_downgrade(&b->c.lock);
-       }
-
-       return b;
-}
-
-static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
-{
-       struct printbuf buf = PRINTBUF;
-
-       if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
-               return;
-
-       prt_printf(&buf,
-                  "btree node header doesn't match ptr: ");
-       bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-       prt_str(&buf, "\nptr: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-       prt_str(&buf, "\nheader: ");
-       bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
-       prt_str(&buf, "\nmin ");
-       bch2_bpos_to_text(&buf, b->data->min_key);
-
-       prt_printf(&buf, "\nmax ");
-       bch2_bpos_to_text(&buf, b->data->max_key);
-
-       bch2_fs_topology_error(c, "%s", buf.buf);
-
-       printbuf_exit(&buf);
-}
-
-static inline void btree_check_header(struct bch_fs *c, struct btree *b)
-{
-       if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
-           b->c.level != BTREE_NODE_LEVEL(b->data) ||
-           !bpos_eq(b->data->max_key, b->key.k.p) ||
-           (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-            !bpos_eq(b->data->min_key,
-                     bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
-               btree_bad_header(c, b);
-}
-
-static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
-                                          const struct bkey_i *k, unsigned level,
-                                          enum six_lock_type lock_type,
-                                          unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
-       bool need_relock = false;
-       int ret;
-
-       EBUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
-       b = btree_cache_find(bc, k);
-       if (unlikely(!b)) {
-               /*
-                * We must have the parent locked to call bch2_btree_node_fill(),
-                * else we could read in a btree node from disk that's been
-                * freed:
-                */
-               b = bch2_btree_node_fill(trans, path, k, path->btree_id,
-                                        level, lock_type, true);
-               need_relock = true;
-
-               /* We raced and found the btree node in the cache */
-               if (!b)
-                       goto retry;
-
-               if (IS_ERR(b))
-                       return b;
-       } else {
-               if (btree_node_read_locked(path, level + 1))
-                       btree_node_unlock(trans, path, level + 1);
-
-               ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       return ERR_PTR(ret);
-
-               BUG_ON(ret);
-
-               if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-                            b->c.level != level ||
-                            race_fault())) {
-                       six_unlock_type(&b->c.lock, lock_type);
-                       if (bch2_btree_node_relock(trans, path, level + 1))
-                               goto retry;
-
-                       trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
-                       return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
-               }
-
-               /* avoid atomic set bit if it's not needed: */
-               if (!btree_node_accessed(b))
-                       set_btree_node_accessed(b);
-       }
-
-       if (unlikely(btree_node_read_in_flight(b))) {
-               u32 seq = six_lock_seq(&b->c.lock);
-
-               six_unlock_type(&b->c.lock, lock_type);
-               bch2_trans_unlock(trans);
-               need_relock = true;
-
-               bch2_btree_node_wait_on_read(b);
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       return ERR_PTR(ret);
-
-               /*
-                * should_be_locked is not set on this path yet, so we need to
-                * relock it specifically:
-                */
-               if (!six_relock_type(&b->c.lock, lock_type, seq))
-                       goto retry;
-       }
-
-       if (unlikely(need_relock)) {
-               ret = bch2_trans_relock(trans) ?:
-                       bch2_btree_path_relock_intent(trans, path);
-               if (ret) {
-                       six_unlock_type(&b->c.lock, lock_type);
-                       return ERR_PTR(ret);
-               }
-       }
-
-       prefetch(b->aux_data);
-
-       for_each_bset(b, t) {
-               void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-               prefetch(p + L1_CACHE_BYTES * 0);
-               prefetch(p + L1_CACHE_BYTES * 1);
-               prefetch(p + L1_CACHE_BYTES * 2);
-       }
-
-       if (unlikely(btree_node_read_error(b))) {
-               six_unlock_type(&b->c.lock, lock_type);
-               return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
-       }
-
-       EBUG_ON(b->c.btree_id != path->btree_id);
-       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       btree_check_header(c, b);
-
-       return b;
-}
-
-/**
- * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * @trans:     btree transaction object
- * @path:      btree_path being traversed
- * @k:         pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
- * @level:     level of btree node being looked up (0 == leaf node)
- * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
- * @trace_ip:  ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
- *
- * The btree node will be returned with either a read or an intent lock held,
- * depending on @lock_type.
- *
- * Returns: btree node or ERR_PTR()
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
-                                 const struct bkey_i *k, unsigned level,
-                                 enum six_lock_type lock_type,
-                                 unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b;
-       int ret;
-
-       EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-       b = btree_node_mem_ptr(k);
-
-       /*
-        * Check b->hash_val _before_ calling btree_node_lock() - this might not
-        * be the node we want anymore, and trying to lock the wrong node could
- * cause an unnecessary transaction restart:
-        */
-       if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
-                    !b ||
-                    b->hash_val != btree_ptr_hash_val(k)))
-               return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
-       if (btree_node_read_locked(path, level + 1))
-               btree_node_unlock(trans, path, level + 1);
-
-       ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               return ERR_PTR(ret);
-
-       BUG_ON(ret);
-
-       if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-                    b->c.level != level ||
-                    race_fault())) {
-               six_unlock_type(&b->c.lock, lock_type);
-               if (bch2_btree_node_relock(trans, path, level + 1))
-                       return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
-               trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
-               return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
-       }
-
-       if (unlikely(btree_node_read_in_flight(b))) {
-               six_unlock_type(&b->c.lock, lock_type);
-               return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-       }
-
-       prefetch(b->aux_data);
-
-       for_each_bset(b, t) {
-               void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-               prefetch(p + L1_CACHE_BYTES * 0);
-               prefetch(p + L1_CACHE_BYTES * 1);
-               prefetch(p + L1_CACHE_BYTES * 2);
-       }
-
-       /* avoid atomic set bit if it's not needed: */
-       if (!btree_node_accessed(b))
-               set_btree_node_accessed(b);
-
-       if (unlikely(btree_node_read_error(b))) {
-               six_unlock_type(&b->c.lock, lock_type);
-               return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
-       }
-
-       EBUG_ON(b->c.btree_id != path->btree_id);
-       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       btree_check_header(c, b);
-
-       return b;
-}
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
-                                        const struct bkey_i *k,
-                                        enum btree_id btree_id,
-                                        unsigned level,
-                                        bool nofill)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
-       int ret;
-
-       EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-       if (c->opts.btree_node_mem_ptr_optimization) {
-               b = btree_node_mem_ptr(k);
-               if (b)
-                       goto lock_node;
-       }
-retry:
-       b = btree_cache_find(bc, k);
-       if (unlikely(!b)) {
-               if (nofill)
-                       goto out;
-
-               b = bch2_btree_node_fill(trans, NULL, k, btree_id,
-                                        level, SIX_LOCK_read, true);
-
-               /* We raced and found the btree node in the cache */
-               if (!b)
-                       goto retry;
-
-               if (IS_ERR(b) &&
-                   !bch2_btree_cache_cannibalize_lock(trans, NULL))
-                       goto retry;
-
-               if (IS_ERR(b))
-                       goto out;
-       } else {
-lock_node:
-               ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       return ERR_PTR(ret);
-
-               BUG_ON(ret);
-
-               if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
-                            b->c.btree_id != btree_id ||
-                            b->c.level != level)) {
-                       six_unlock_read(&b->c.lock);
-                       goto retry;
-               }
-
-               /* avoid atomic set bit if it's not needed: */
-               if (!btree_node_accessed(b))
-                       set_btree_node_accessed(b);
-       }
-
-       /* XXX: waiting on IO with btree locks held: */
-       __bch2_btree_node_wait_on_read(b);
-
-       prefetch(b->aux_data);
-
-       for_each_bset(b, t) {
-               void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
-               prefetch(p + L1_CACHE_BYTES * 0);
-               prefetch(p + L1_CACHE_BYTES * 1);
-               prefetch(p + L1_CACHE_BYTES * 2);
-       }
-
-       if (unlikely(btree_node_read_error(b))) {
-               six_unlock_read(&b->c.lock);
-               b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
-               goto out;
-       }
-
-       EBUG_ON(b->c.btree_id != btree_id);
-       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       btree_check_header(c, b);
-out:
-       bch2_btree_cache_cannibalize_unlock(trans);
-       return b;
-}
-
-int bch2_btree_node_prefetch(struct btree_trans *trans,
-                            struct btree_path *path,
-                            const struct bkey_i *k,
-                            enum btree_id btree_id, unsigned level)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-
-       BUG_ON(path && !btree_node_locked(path, level + 1));
-       BUG_ON(level >= BTREE_MAX_DEPTH);
-
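-       /* If it's already in the cache there's nothing to do: */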
-       struct btree *b = btree_cache_find(bc, k);
-       if (b)
-               return 0;
-
-       b = bch2_btree_node_fill(trans, path, k, btree_id,
-                                level, SIX_LOCK_read, false);
-       int ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               return ret;
-       if (b)
-               six_unlock_read(&b->c.lock);
-       return 0;
-}
-
-void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_cache *bc = &c->btree_cache;
-       struct btree *b;
-
-       b = btree_cache_find(bc, k);
-       if (!b)
-               return;
-
-       BUG_ON(b == btree_node_root(trans->c, b));
-wait_on_io:
-       /* not allowed to wait on io with btree locks held: */
-
-       /*
-        * XXX we're called from btree_gc which will be holding other btree
-        * nodes locked
-        */
-       __bch2_btree_node_wait_on_read(b);
-       __bch2_btree_node_wait_on_write(b);
-
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-       if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
-               goto out;
-
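-       /*
-        * Can't free a dirty node: write it out, then drop our locks and go
-        * back to waiting on the write to complete:
-        */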
-       if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
-               six_unlock_write(&b->c.lock);
-               six_unlock_intent(&b->c.lock);
-               goto wait_on_io;
-       }
-
-       BUG_ON(btree_node_dirty(b));
-
-       mutex_lock(&bc->lock);
-       bch2_btree_node_hash_remove(bc, b);
-       btree_node_data_free(bc, b);
-       mutex_unlock(&bc->lock);
-out:
-       six_unlock_write(&b->c.lock);
-       six_unlock_intent(&b->c.lock);
-}
-
-const char *bch2_btree_id_str(enum btree_id btree)
-{
-       return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
-}
-
-void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
-{
-       if (btree < BTREE_ID_NR)
-               prt_str(out, __bch2_btree_ids[btree]);
-       else
-               prt_printf(out, "(unknown btree %u)", btree);
-}
-
-void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
-{
-       prt_str(out, "btree=");
-       bch2_btree_id_to_text(out, btree);
-       prt_printf(out, " level=%u", level);
-}
-
-void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
-                             enum btree_id btree, unsigned level, struct bkey_s_c k)
-{
-       bch2_btree_id_to_text(out, btree);
-       prt_printf(out, " level %u/", level);
-       struct btree_root *r = bch2_btree_id_root(c, btree);
-       if (r)
-               prt_printf(out, "%u", r->level);
-       else
-               prt_printf(out, "(unknown)");
-       prt_newline(out);
-
-       bch2_bkey_val_to_text(out, c, k);
-}
-
-void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
-       __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
-}
-
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
-       struct bset_stats stats;
-
-       memset(&stats, 0, sizeof(stats));
-
-       bch2_btree_keys_stats(b, &stats);
-
-       prt_printf(out, "l %u ", b->c.level);
-       bch2_bpos_to_text(out, b->data->min_key);
-       prt_printf(out, " - ");
-       bch2_bpos_to_text(out, b->data->max_key);
-       prt_printf(out, ":\n"
-              "    ptrs: ");
-       bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-       prt_newline(out);
-
-       prt_printf(out,
-              "    format: ");
-       bch2_bkey_format_to_text(out, &b->format);
-
-       prt_printf(out,
-              "    unpack fn len: %u\n"
-              "    bytes used %zu/%zu (%zu%% full)\n"
-              "    sib u64s: %u, %u (merge threshold %u)\n"
-              "    nr packed keys %u\n"
-              "    nr unpacked keys %u\n"
-              "    floats %zu\n"
-              "    failed unpacked %zu\n",
-              b->unpack_fn_len,
-              b->nr.live_u64s * sizeof(u64),
-              btree_buf_bytes(b) - sizeof(struct btree_node),
-              b->nr.live_u64s * 100 / btree_max_u64s(c),
-              b->sib_u64s[0],
-              b->sib_u64s[1],
-              c->btree_foreground_merge_threshold,
-              b->nr.packed_keys,
-              b->nr.unpacked_keys,
-              stats.floats,
-              stats.failed);
-}
-
-static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
-                                const char *label, size_t nr)
-{
-       prt_printf(out, "%s\t", label);
-       prt_human_readable_u64(out, nr * c->opts.btree_node_size);
-       prt_printf(out, " (%zu)\n", nr);
-}
-
-static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
-#define x(n) #n,
-       BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
-       NULL
-};
-
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
-{
-       struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 32);
-
-       prt_btree_cache_line(out, c, "live:",           bc->live[0].nr);
-       prt_btree_cache_line(out, c, "pinned:",         bc->live[1].nr);
-       prt_btree_cache_line(out, c, "reserve:",        bc->nr_reserve);
-       prt_btree_cache_line(out, c, "freed:",          bc->nr_freeable);
-       prt_btree_cache_line(out, c, "dirty:",          atomic_long_read(&bc->nr_dirty));
-       prt_printf(out, "cannibalize lock:\t%s\n",      bc->alloc_lock ? "held" : "not held");
-       prt_newline(out);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
-               bch2_btree_id_to_text(out, i);
-               prt_printf(out, "\t");
-               prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
-               prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
-       }
-
-       prt_newline(out);
-       prt_printf(out, "counters since mount:\n");
-       prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
-       prt_printf(out, "not freed:\n");
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
-               prt_printf(out, "  %s\t%llu\n",
-                          bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
-}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
deleted file mode 100644 (file)
index be275f8..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_CACHE_H
-#define _BCACHEFS_BTREE_CACHE_H
-
-#include "bcachefs.h"
-#include "btree_types.h"
-#include "bkey_methods.h"
-
-extern const char * const bch2_btree_node_flags[];
-
-struct btree_iter;
-
-void bch2_recalc_btree_reserve(struct bch_fs *);
-
-void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
-
-void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-
-int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
-int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
-                               unsigned, enum btree_id);
-
-void bch2_node_pin(struct bch_fs *, struct btree *);
-void bch2_btree_cache_unpin(struct bch_fs *);
-
-void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
-                                     struct bkey_s_c, struct bkey_i *);
-
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-
-void __btree_node_data_free(struct btree *);
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
-                                 const struct bkey_i *, unsigned,
-                                 enum six_lock_type, unsigned long);
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
-                                        enum btree_id, unsigned, bool);
-
-int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
-                            const struct bkey_i *, enum btree_id, unsigned);
-
-void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
-
-void bch2_fs_btree_cache_exit(struct bch_fs *);
-int bch2_fs_btree_cache_init(struct bch_fs *);
-void bch2_fs_btree_cache_init_early(struct btree_cache *);
-
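-/*
- * Hash for the btree node cache: the v2 pointer's sequence number, or for old
- * style pointers the first pointer itself, as a raw u64:
- */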
-static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
-{
-       switch (k->k.type) {
-       case KEY_TYPE_btree_ptr:
-               return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
-       case KEY_TYPE_btree_ptr_v2:
-               /*
-                * The cast/deref is only necessary to avoid sparse endianness
-                * warnings:
-                */
-               return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
-       default:
-               return 0;
-       }
-}
-
-static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
-{
-       return k->k.type == KEY_TYPE_btree_ptr_v2
-               ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
-               : NULL;
-}
-
-/* is btree node in hash table? */
-static inline bool btree_node_hashed(struct btree *b)
-{
-       return b->hash_val != 0;
-}
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)               \
-       for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,  \
-                                         &(_c)->btree_cache.table),    \
-            _iter = 0; _iter < (_tbl)->size; _iter++)                  \
-               rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_buf_bytes(const struct btree *b)
-{
-       return 1UL << b->byte_order;
-}
-
-static inline size_t btree_buf_max_u64s(const struct btree *b)
-{
-       return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_max_u64s(const struct bch_fs *c)
-{
-       return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
-       return c->opts.btree_node_size >> SECTOR_SHIFT;
-}
-
-static inline unsigned btree_blocks(const struct bch_fs *c)
-{
-       return btree_sectors(c) >> c->block_bits;
-}
-
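-/*
- * Nodes are split when more than 2/3 full; merging with siblings is
- * considered below 1/3 full, with 25% hysteresis to avoid thrashing:
- */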
-#define BTREE_SPLIT_THRESHOLD(c)               (btree_max_u64s(c) * 2 / 3)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)    (btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)                   \
-       (BTREE_FOREGROUND_MERGE_THRESHOLD(c) +                  \
-        (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
-
-static inline unsigned btree_id_nr_alive(struct bch_fs *c)
-{
-       return BTREE_ID_NR + c->btree_roots_extra.nr;
-}
-
-static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
-{
-       if (likely(id < BTREE_ID_NR)) {
-               return &c->btree_roots_known[id];
-       } else {
-               unsigned idx = id - BTREE_ID_NR;
-
-               /* This can happen when we're called from btree_node_scan */
-               if (idx >= c->btree_roots_extra.nr)
-                       return NULL;
-
-               return &c->btree_roots_extra.data[idx];
-       }
-}
-
-static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
-{
-       struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
-
-       return r ? r->b : NULL;
-}
-
-const char *bch2_btree_id_str(enum btree_id);  /* avoid */
-void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
-void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
-
-void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
-                             enum btree_id, unsigned, struct bkey_s_c);
-void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
-
-#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
deleted file mode 100644 (file)
index bac108e..0000000
+++ /dev/null
@@ -1,1308 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "progress.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE         10
-#define DROP_PREV_NODE         11
-#define DID_FILL_FROM_SCAN     12
-
-/*
- * Returns false if it's a btree we can easily reconstruct, or otherwise won't
- * cause data loss if it's missing:
- */
-static bool btree_id_important(enum btree_id btree)
-{
-       if (btree_id_is_alloc(btree))
-               return false;
-
-       switch (btree) {
-       case BTREE_ID_quotas:
-       case BTREE_ID_snapshot_trees:
-       case BTREE_ID_logged_ops:
-       case BTREE_ID_rebalance_work:
-       case BTREE_ID_subvolume_children:
-               return false;
-       default:
-               return true;
-       }
-}
-
-static const char * const bch2_gc_phase_strs[] = {
-#define x(n)   #n,
-       GC_PHASES()
-#undef x
-       NULL
-};
-
-void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
-{
-       prt_str(out, bch2_gc_phase_strs[p->phase]);
-       prt_char(out, ' ');
-       bch2_btree_id_level_to_text(out, p->btree, p->level);
-       prt_char(out, ' ');
-       bch2_bpos_to_text(out, p->pos);
-}
-
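-/* Cast away const so an existing key can be passed to the trigger path: */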
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
-       return (struct bkey_s) {{{
-               (struct bkey *) k.k,
-               (struct bch_val *) k.v
-       }}};
-}
-
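-/*
- * gc_pos is read locklessly; writers publish it under the gc_pos_lock
- * seqcount so readers always see a consistent position:
- */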
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-       preempt_disable();
-       write_seqcount_begin(&c->gc_pos_lock);
-       c->gc_pos = new_pos;
-       write_seqcount_end(&c->gc_pos_lock);
-       preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
-       BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
-       __gc_pos_set(c, new_pos);
-}
-
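-/*
- * Topology repair adjusts a node's min_key, which only btree_ptr_v2 can
- * represent - so old style pointers are upgraded first:
- */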
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
-       switch (b->key.k.type) {
-       case KEY_TYPE_btree_ptr: {
-               struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
-               dst->k.p                = src->k.p;
-               dst->v.mem_ptr          = 0;
-               dst->v.seq              = b->data->keys.seq;
-               dst->v.sectors_written  = 0;
-               dst->v.flags            = 0;
-               dst->v.min_key          = b->data->min_key;
-               set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
-               memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
-               break;
-       }
-       case KEY_TYPE_btree_ptr_v2:
-               bkey_copy(&dst->k_i, &b->key);
-               break;
-       default:
-               BUG();
-       }
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
-       struct bkey_i_btree_ptr_v2 *new;
-       int ret;
-
-       if (c->opts.verbose) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-               prt_str(&buf, " -> ");
-               bch2_bpos_to_text(&buf, new_min);
-
-               bch_info(c, "%s(): %s", __func__, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
-       if (!new)
-               return bch_err_throw(c, ENOMEM_gc_repair_key);
-
-       btree_ptr_to_v2(b, new);
-       b->data->min_key        = new_min;
-       new->v.min_key          = new_min;
-       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
-       ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
-       if (ret) {
-               kfree(new);
-               return ret;
-       }
-
-       bch2_btree_node_drop_keys_outside_node(b);
-       bkey_copy(&b->key, &new->k_i);
-       return 0;
-}
-
-static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
-{
-       struct bkey_i_btree_ptr_v2 *new;
-       int ret;
-
-       if (c->opts.verbose) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-               prt_str(&buf, " -> ");
-               bch2_bpos_to_text(&buf, new_max);
-
-               bch_info(c, "%s(): %s", __func__, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
-       if (ret)
-               return ret;
-
-       new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
-       if (!new)
-               return bch_err_throw(c, ENOMEM_gc_repair_key);
-
-       btree_ptr_to_v2(b, new);
-       b->data->max_key        = new_max;
-       new->k.p                = new_max;
-       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
-       ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
-       if (ret) {
-               kfree(new);
-               return ret;
-       }
-
-       bch2_btree_node_drop_keys_outside_node(b);
-
-       mutex_lock(&c->btree_cache.lock);
-       __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-       bkey_copy(&b->key, &new->k_i);
-       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-       BUG_ON(ret);
-       mutex_unlock(&c->btree_cache.lock);
-       return 0;
-}
-
-static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
-                                      struct btree *prev, struct btree *cur,
-                                      struct bpos *pulled_from_scan)
-{
-       struct bch_fs *c = trans->c;
-       struct bpos expected_start = !prev
-               ? b->data->min_key
-               : bpos_successor(prev->key.k.p);
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-              !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
-                       b->data->min_key));
-
-       if (bpos_eq(expected_start, cur->data->min_key))
-               return 0;
-
-       prt_printf(&buf, "  at ");
-       bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-       prt_printf(&buf, ":\nparent: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-       if (prev) {
-               prt_printf(&buf, "\nprev: ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
-       }
-
-       prt_str(&buf, "\nnext: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
-
-       if (bpos_lt(expected_start, cur->data->min_key)) {                              /* gap */
-               if (b->c.level == 1 &&
-                   bpos_lt(*pulled_from_scan, cur->data->min_key)) {
-                       ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
-                                                    expected_start,
-                                                    bpos_predecessor(cur->data->min_key));
-                       if (ret)
-                               goto err;
-
-                       *pulled_from_scan = cur->data->min_key;
-                       ret = DID_FILL_FROM_SCAN;
-               } else {
-                       if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
-                                            "btree node with incorrect min_key%s", buf.buf))
-                               ret = set_node_min(c, cur, expected_start);
-               }
-       } else {                                                                        /* overlap */
-               if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {   /* cur overwrites prev */
-                       if (bpos_ge(prev->data->min_key, cur->data->min_key)) {         /* fully? */
-                               if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
-                                                    "btree node overwritten by next node%s", buf.buf))
-                                       ret = DROP_PREV_NODE;
-                       } else {
-                               if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
-                                                    "btree node with incorrect max_key%s", buf.buf))
-                                       ret = set_node_max(c, prev,
-                                                          bpos_predecessor(cur->data->min_key));
-                       }
-               } else {
-                       if (bpos_ge(expected_start, cur->data->max_key)) {              /* fully? */
-                               if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
-                                                    "btree node overwritten by prev node%s", buf.buf))
-                                       ret = DROP_THIS_NODE;
-                       } else {
-                               if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
-                                                    "btree node with incorrect min_key%s", buf.buf))
-                                       ret = set_node_min(c, cur, expected_start);
-                       }
-               }
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
-                                struct btree *child, struct bpos *pulled_from_scan)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       if (bpos_eq(child->key.k.p, b->key.k.p))
-               return 0;
-
-       prt_printf(&buf, "\nat: ");
-       bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-       prt_printf(&buf, "\nparent: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-       prt_str(&buf, "\nchild: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
-
-       if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
-                            "btree node with incorrect max_key%s", buf.buf)) {
-               if (b->c.level == 1 &&
-                   bpos_lt(*pulled_from_scan, b->key.k.p)) {
-                       ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
-                                               bpos_successor(child->key.k.p), b->key.k.p);
-                       if (ret)
-                               goto err;
-
-                       *pulled_from_scan = b->key.k.p;
-                       ret = DID_FILL_FROM_SCAN;
-               } else {
-                       ret = set_node_max(c, child, b->key.k.p);
-               }
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
-                                             struct bpos *pulled_from_scan)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_and_journal_iter iter;
-       struct bkey_s_c k;
-       struct bkey_buf prev_k, cur_k;
-       struct btree *prev = NULL, *cur = NULL;
-       bool have_child, new_pass = false;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       if (!b->c.level)
-               return 0;
-
-       bch2_bkey_buf_init(&prev_k);
-       bch2_bkey_buf_init(&cur_k);
-again:
-       cur = prev = NULL;
-       have_child = new_pass = false;
-       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-       iter.prefetch = true;
-
-       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               BUG_ON(bpos_lt(k.k->p, b->data->min_key));
-               BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
-               bch2_btree_and_journal_iter_advance(&iter);
-               bch2_bkey_buf_reassemble(&cur_k, c, k);
-
-               cur = bch2_btree_node_get_noiter(trans, cur_k.k,
-                                       b->c.btree_id, b->c.level - 1,
-                                       false);
-               ret = PTR_ERR_OR_ZERO(cur);
-
-               printbuf_reset(&buf);
-               bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
-               prt_char(&buf, ' ');
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
-
-               if (bch2_err_matches(ret, EIO)) {
-                       bch2_btree_node_evict(trans, cur_k.k);
-                       cur = NULL;
-                       ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur_k.k->k.p);
-                       if (ret)
-                               break;
-                       continue;
-               }
-
-               bch_err_msg(c, ret, "getting btree node");
-               if (ret)
-                       break;
-
-               if (bch2_btree_node_is_stale(c, cur)) {
-                       bch_info(c, "btree node older than nodes found by scanning\n  %s", buf.buf);
-                       six_unlock_read(&cur->c.lock);
-                       bch2_btree_node_evict(trans, cur_k.k);
-                       ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur_k.k->k.p);
-                       cur = NULL;
-                       if (ret)
-                               break;
-                       continue;
-               }
-
-               ret = lockrestart_do(trans,
-                       btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
-               if (ret < 0)
-                       goto err;
-
-               if (ret == DID_FILL_FROM_SCAN) {
-                       new_pass = true;
-                       ret = 0;
-               }
-
-               if (ret == DROP_THIS_NODE) {
-                       six_unlock_read(&cur->c.lock);
-                       bch2_btree_node_evict(trans, cur_k.k);
-                       ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur_k.k->k.p);
-                       cur = NULL;
-                       if (ret)
-                               break;
-                       continue;
-               }
-
-               if (prev)
-                       six_unlock_read(&prev->c.lock);
-               prev = NULL;
-
-               if (ret == DROP_PREV_NODE) {
-                       bch_info(c, "dropped prev node");
-                       bch2_btree_node_evict(trans, prev_k.k);
-                       ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, prev_k.k->k.p);
-                       if (ret)
-                               break;
-
-                       bch2_btree_and_journal_iter_exit(&iter);
-                       goto again;
-               } else if (ret)
-                       break;
-
-               prev = cur;
-               cur = NULL;
-               bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
-       }
-
-       if (!ret && !IS_ERR_OR_NULL(prev)) {
-               BUG_ON(cur);
-               ret = lockrestart_do(trans,
-                       btree_repair_node_end(trans, b, prev, pulled_from_scan));
-               if (ret == DID_FILL_FROM_SCAN) {
-                       new_pass = true;
-                       ret = 0;
-               }
-       }
-
-       if (!IS_ERR_OR_NULL(prev))
-               six_unlock_read(&prev->c.lock);
-       prev = NULL;
-       if (!IS_ERR_OR_NULL(cur))
-               six_unlock_read(&cur->c.lock);
-       cur = NULL;
-
-       if (ret)
-               goto err;
-
-       bch2_btree_and_journal_iter_exit(&iter);
-
-       if (new_pass)
-               goto again;
-
-       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-       iter.prefetch = true;
-
-       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               bch2_bkey_buf_reassemble(&cur_k, c, k);
-               bch2_btree_and_journal_iter_advance(&iter);
-
-               cur = bch2_btree_node_get_noiter(trans, cur_k.k,
-                                       b->c.btree_id, b->c.level - 1,
-                                       false);
-               ret = PTR_ERR_OR_ZERO(cur);
-
-               bch_err_msg(c, ret, "getting btree node");
-               if (ret)
-                       goto err;
-
-               ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
-               six_unlock_read(&cur->c.lock);
-               cur = NULL;
-
-               if (ret == DROP_THIS_NODE) {
-                       bch2_btree_node_evict(trans, cur_k.k);
-                       ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur_k.k->k.p);
-                       new_pass = true;
-               }
-
-               if (ret)
-                       goto err;
-
-               have_child = true;
-       }
-
-       printbuf_reset(&buf);
-       bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-       /*
-        * XXX: we're not passing the trans object here because we're not set up
-        * to handle a transaction restart - this code needs to be rewritten
-        * when we start doing online topology repair
-        */
-       bch2_trans_unlock_long(trans);
-       if (mustfix_fsck_err_on(!have_child,
-                       c, btree_node_topology_interior_node_empty,
-                       "empty interior btree node at %s", buf.buf))
-               ret = DROP_THIS_NODE;
-err:
-fsck_err:
-       if (!IS_ERR_OR_NULL(prev))
-               six_unlock_read(&prev->c.lock);
-       if (!IS_ERR_OR_NULL(cur))
-               six_unlock_read(&cur->c.lock);
-
-       bch2_btree_and_journal_iter_exit(&iter);
-
-       if (!ret && new_pass)
-               goto again;
-
-       BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
-
-       bch2_bkey_buf_exit(&prev_k, c);
-       bch2_bkey_buf_exit(&cur_k, c);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
-                          bool *reconstructed_root)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_root *r = bch2_btree_id_root(c, btree);
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       bch2_btree_id_to_text(&buf, btree);
-
-       if (r->error) {
-               bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
-
-               ret = bch2_btree_has_scanned_nodes(c, btree);
-               if (ret < 0)
-                       goto err;
-
-               if (!ret) {
-                       __fsck_err(trans,
-                                  FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
-                                  btree_root_unreadable_and_scan_found_nothing,
-                                  "no nodes found for btree %s, continue?", buf.buf);
-
-                       r->alive = false;
-                       r->error = 0;
-                       bch2_btree_root_alloc_fake_trans(trans, btree, 0);
-               } else {
-                       r->alive = false;
-                       r->error = 0;
-                       bch2_btree_root_alloc_fake_trans(trans, btree, 1);
-
-                       bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-                       ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
-                       if (ret)
-                               goto err;
-               }
-
-               *reconstructed_root = true;
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_check_topology(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bpos pulled_from_scan = POS_MIN;
-       int ret = 0;
-
-       bch2_trans_srcu_unlock(trans);
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
-               bool reconstructed_root = false;
-recover:
-               ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
-               if (ret)
-                       break;
-
-               struct btree_root *r = bch2_btree_id_root(c, i);
-               struct btree *b = r->b;
-
-               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-               ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
-               six_unlock_read(&b->c.lock);
-
-               if (ret == DROP_THIS_NODE) {
-                       mutex_lock(&c->btree_cache.lock);
-                       bch2_btree_node_hash_remove(&c->btree_cache, b);
-                       mutex_unlock(&c->btree_cache.lock);
-
-                       r->b = NULL;
-
-                       if (!reconstructed_root) {
-                               r->error = -EIO;
-                               goto recover;
-                       }
-
-                       struct printbuf buf = PRINTBUF;
-                       bch2_btree_id_to_text(&buf, i);
-                       bch_err(c, "empty btree root %s", buf.buf);
-                       printbuf_exit(&buf);
-                       bch2_btree_root_alloc_fake_trans(trans, i, 0);
-                       r->alive = false;
-                       ret = 0;
-               }
-       }
-
-       bch2_trans_put(trans);
-       return ret;
-}
-
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
-                           unsigned level, struct btree **prev,
-                           struct btree_iter *iter, struct bkey_s_c k,
-                           bool initial)
-{
-       struct bch_fs *c = trans->c;
-
-       if (iter) {
-               struct btree_path *path = btree_iter_path(trans, iter);
-               struct btree *b = path_l(path)->b;
-
-               if (*prev != b) {
-                       int ret = bch2_btree_node_check_topology(trans, b);
-                       if (ret)
-                               return ret;
-               }
-               *prev = b;
-       }
-
-       struct bkey deleted = KEY(0, 0, 0);
-       struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       deleted.p = k.k->p;
-
-       if (initial) {
-               BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
-                      k.k->bversion.lo > atomic64_read(&c->journal.seq));
-
-               if (fsck_err_on(btree_id != BTREE_ID_accounting &&
-                               k.k->bversion.lo > atomic64_read(&c->key_version),
-                               trans, bkey_version_in_future,
-                               "key version number higher than recorded %llu\n%s",
-                               atomic64_read(&c->key_version),
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       atomic64_set(&c->key_version, k.k->bversion.lo);
-       }
-
-       if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
-                               trans, btree_bitmap_not_marked,
-                               "btree ptr not marked in member info btree allocated bitmap\n%s",
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k),
-                                buf.buf))) {
-               mutex_lock(&c->sb_lock);
-               bch2_dev_btree_bitmap_mark(c, k);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-       }
-
-       /*
-        * We require a commit before key_trigger() because
-        * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
-        * wrong result if we run it multiple times.
-        */
-       unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
-
-       ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
-                              BTREE_TRIGGER_check_repair|flags);
-       if (ret)
-               goto out;
-
-       if (trans->nr_updates) {
-               ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
-                       -BCH_ERR_transaction_restart_nested;
-               goto out;
-       }
-
-       ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
-                              BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
-out:
-fsck_err:
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans,
-                        struct progress_indicator_state *progress,
-                        enum btree_id btree, bool initial)
-{
-       struct bch_fs *c = trans->c;
-       unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
-       int ret = 0;
-
-       /* We need to make sure every leaf node is readable before going RW */
-       if (initial)
-               target_depth = 0;
-
-       for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
-               struct btree *prev = NULL;
-               struct btree_iter iter;
-               bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
-                                         BTREE_ITER_prefetch);
-
-               ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-                       bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
-                       gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
-                       bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
-               }));
-               if (ret)
-                       goto err;
-       }
-
-       /* root */
-       do {
-retry_root:
-               bch2_trans_begin(trans);
-
-               struct btree_iter iter;
-               bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
-                                         0, bch2_btree_id_root(c, btree)->b->c.level, 0);
-               struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-               ret = PTR_ERR_OR_ZERO(b);
-               if (ret)
-                       goto err_root;
-
-               if (b != btree_node_root(c, b)) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       goto retry_root;
-               }
-
-               gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
-               struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-               ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
-err_root:
-               bch2_trans_iter_exit(trans, &iter);
-       } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-err:
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
-{
-       return cmp_int(gc_btree_order(l), gc_btree_order(r));
-}
-
-static int bch2_gc_btrees(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct progress_indicator_state progress;
-       bch2_progress_init(&progress, c, ~0ULL);
-
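-       /* Walk the btrees in gc phase order, not btree id order: */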
-       enum btree_id ids[BTREE_ID_NR];
-       for (unsigned i = 0; i < BTREE_ID_NR; i++)
-               ids[i] = i;
-       bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
-               unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
-
-               if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
-                       continue;
-
-               ret = bch2_gc_btree(trans, &progress, btree, true);
-       }
-
-       printbuf_exit(&buf);
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_mark_superblocks(struct bch_fs *c)
-{
-       gc_pos_set(c, gc_phase(GC_PHASE_sb));
-
-       return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
-}
-
-static void bch2_gc_free(struct bch_fs *c)
-{
-       bch2_accounting_gc_free(c);
-
-       genradix_free(&c->reflink_gc_table);
-       genradix_free(&c->gc_stripes);
-
-       for_each_member_device(c, ca)
-               genradix_free(&ca->buckets_gc);
-}
-
-static int bch2_gc_start(struct bch_fs *c)
-{
-       for_each_member_device(c, ca) {
-               int ret = bch2_dev_usage_init(ca, true);
-               if (ret) {
-                       bch2_dev_put(ca);
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-/* returns true if not equal */
-static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
-                                    struct bch_alloc_v4 r)
-{
-       return  l.gen != r.gen                          ||
-               l.oldest_gen != r.oldest_gen            ||
-               l.data_type != r.data_type              ||
-               l.dirty_sectors != r.dirty_sectors      ||
-               l.stripe_sectors != r.stripe_sectors    ||
-               l.cached_sectors != r.cached_sectors    ||
-               l.stripe_redundancy != r.stripe_redundancy ||
-               l.stripe != r.stripe;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
-                               struct btree_iter *iter,
-                               struct bch_dev *ca,
-                               struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i_alloc_v4 *a;
-       struct bch_alloc_v4 old_gc, gc, old_convert, new;
-       const struct bch_alloc_v4 *old;
-       int ret;
-
-       if (!bucket_valid(ca, k.k->p.offset))
-               return 0;
-
-       old = bch2_alloc_to_v4(k, &old_convert);
-       gc = new = *old;
-
-       __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
-
-       old_gc = gc;
-
-       if ((old->data_type == BCH_DATA_sb ||
-            old->data_type == BCH_DATA_journal) &&
-           !bch2_dev_is_online(ca)) {
-               gc.data_type = old->data_type;
-               gc.dirty_sectors = old->dirty_sectors;
-       }
-
-       /*
-        * gc.data_type doesn't yet include need_discard & need_gc_gen states -
-        * fix that here:
-        */
-       alloc_data_type_set(&gc, gc.data_type);
-       if (gc.data_type != old_gc.data_type ||
-           gc.dirty_sectors != old_gc.dirty_sectors) {
-               ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
-               if (ret)
-                       return ret;
-
-               /*
-                * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
-                * safe w.r.t. transaction restarts, so fixup the gc_bucket so
-                * we don't run it twice:
-                */
-               struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
-               gc_m->data_type = gc.data_type;
-               gc_m->dirty_sectors = gc.dirty_sectors;
-       }
-
-       if (fsck_err_on(new.data_type != gc.data_type,
-                       trans, alloc_key_data_type_wrong,
-                       "bucket %llu:%llu gen %u has wrong data_type"
-                       ": got %s, should be %s",
-                       iter->pos.inode, iter->pos.offset,
-                       gc.gen,
-                       bch2_data_type_str(new.data_type),
-                       bch2_data_type_str(gc.data_type)))
-               new.data_type = gc.data_type;
-
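-/* If an alloc key field disagrees with what gc recomputed, take gc's value: */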
-#define copy_bucket_field(_errtype, _f)                                        \
-       if (fsck_err_on(new._f != gc._f,                                \
-                       trans, _errtype,                                \
-                       "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
-                       ": got %llu, should be %llu",                   \
-                       iter->pos.inode, iter->pos.offset,              \
-                       gc.gen,                                         \
-                       bch2_data_type_str(gc.data_type),               \
-                       (u64) new._f, (u64) gc._f))                             \
-               new._f = gc._f;                                         \
-
-       copy_bucket_field(alloc_key_gen_wrong,                  gen);
-       copy_bucket_field(alloc_key_dirty_sectors_wrong,        dirty_sectors);
-       copy_bucket_field(alloc_key_stripe_sectors_wrong,       stripe_sectors);
-       copy_bucket_field(alloc_key_cached_sectors_wrong,       cached_sectors);
-       copy_bucket_field(alloc_key_stripe_wrong,               stripe);
-       copy_bucket_field(alloc_key_stripe_redundancy_wrong,    stripe_redundancy);
-#undef copy_bucket_field
-
-       if (!bch2_alloc_v4_cmp(*old, new))
-               return 0;
-
-       a = bch2_alloc_to_v4_mut(trans, k);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               return ret;
-
-       a->v = new;
-
-       /*
-        * The trigger normally makes sure these are set, but we're not running
-        * triggers:
-        */
-       if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
-               a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
-       ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
-fsck_err:
-       return ret;
-}
-
-static int bch2_gc_alloc_done(struct bch_fs *c)
-{
-       int ret = 0;
-
-       for_each_member_device(c, ca) {
-               ret = bch2_trans_run(c,
-                       for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
-                                       POS(ca->dev_idx, ca->mi.first_bucket),
-                                       POS(ca->dev_idx, ca->mi.nbuckets - 1),
-                                       BTREE_ITER_slots|BTREE_ITER_prefetch, k,
-                                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                               bch2_alloc_write_key(trans, &iter, ca, k)));
-               if (ret) {
-                       bch2_dev_put(ca);
-                       break;
-               }
-       }
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_gc_alloc_start(struct bch_fs *c)
-{
-       int ret = 0;
-
-       for_each_member_device(c, ca) {
-               ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
-               if (ret) {
-                       bch2_dev_put(ca);
-                       ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
-                       break;
-               }
-       }
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_gc_write_stripes_key(struct btree_trans *trans,
-                                    struct btree_iter *iter,
-                                    struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       const struct bch_stripe *s;
-       struct gc_stripe *m;
-       bool bad = false;
-       unsigned i;
-       int ret = 0;
-
-       if (k.k->type != KEY_TYPE_stripe)
-               return 0;
-
-       s = bkey_s_c_to_stripe(k).v;
-       m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
-       for (i = 0; i < s->nr_blocks; i++) {
-               u32 old = stripe_blockcount_get(s, i);
-               u32 new = (m ? m->block_sectors[i] : 0);
-
-               if (old != new) {
-                       prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
-                                  i, old, new);
-                       bad = true;
-               }
-       }
-
-       if (bad)
-               bch2_bkey_val_to_text(&buf, c, k);
-
-       if (fsck_err_on(bad,
-                       trans, stripe_sector_count_wrong,
-                       "%s", buf.buf)) {
-               struct bkey_i_stripe *new;
-
-               new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       return ret;
-
-               bkey_reassemble(&new->k_i, k);
-
-               for (i = 0; i < new->v.nr_blocks; i++)
-                       stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
-               ret = bch2_trans_update(trans, iter, &new->k_i, 0);
-       }
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c)
-{
-       return bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_stripes, POS_MIN,
-                               BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_gc_write_stripes_key(trans, &iter, k)));
-}
-
-/**
- * bch2_check_allocations - walk all references to buckets, and recompute them:
- *
- * @c:                 filesystem object
- *
- * Returns: 0 on success, or standard errcode on failure
- *
- * Order matters here:
- *  - Concurrent GC relies on the fact that we have a total ordering for
- *    everything that GC walks - see gc_will_visit_node(),
- *    gc_will_visit_root()
- *
- *  - also, references move around in the course of index updates and
- *    various other crap: everything needs to agree on the ordering
- *    references are allowed to move around in - e.g., we're allowed to
- *    start with a reference owned by an open_bucket (the allocator) and
- *    move it to the btree, but not the reverse.
- *
- *    This is necessary to ensure that gc doesn't miss references that
- *    move around - if references move backwards in the ordering GC
- *    uses, GC could skip past them
- */
-int bch2_check_allocations(struct bch_fs *c)
-{
-       int ret;
-
-       down_read(&c->state_lock);
-       down_write(&c->gc_lock);
-
-       bch2_btree_interior_updates_flush(c);
-
-       ret   = bch2_gc_accounting_start(c) ?:
-               bch2_gc_start(c) ?:
-               bch2_gc_alloc_start(c) ?:
-               bch2_gc_reflink_start(c);
-       if (ret)
-               goto out;
-
-       gc_pos_set(c, gc_phase(GC_PHASE_start));
-
-       ret = bch2_mark_superblocks(c);
-       bch_err_msg(c, ret, "marking superblocks");
-       if (ret)
-               goto out;
-
-       ret = bch2_gc_btrees(c);
-       if (ret)
-               goto out;
-
-       c->gc_count++;
-
-       ret   = bch2_gc_alloc_done(c) ?:
-               bch2_gc_accounting_done(c) ?:
-               bch2_gc_stripes_done(c) ?:
-               bch2_gc_reflink_done(c);
-out:
-       percpu_down_write(&c->mark_lock);
-       /* Indicates that gc is no longer in progress: */
-       __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
-
-       bch2_gc_free(c);
-       percpu_up_write(&c->mark_lock);
-
-       up_write(&c->gc_lock);
-       up_read(&c->state_lock);
-
-       /*
-        * At startup, allocations can happen directly instead of via the
-        * allocator thread - issue wakeup in case they blocked on gc_lock:
-        */
-       closure_wake_up(&c->freelist_wait);
-
-       if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
-               bch2_sb_members_clean_deleted(c);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int gc_btree_gens_key(struct btree_trans *trans,
-                            struct btree_iter *iter,
-                            struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
-               return -EROFS;
-
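-       /*
-        * If any pointer is more than 16 gens stale, rewrite the key so stale
-        * cached pointers get dropped; otherwise just record the oldest gen
-        * still referenced in each bucket:
-        */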
-       bool too_stale = false;
-       scoped_guard(rcu) {
-               bkey_for_each_ptr(ptrs, ptr) {
-                       struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-                       if (!ca)
-                               continue;
-
-                       too_stale |= dev_ptr_stale(ca, ptr) > 16;
-               }
-
-               if (!too_stale)
-                       bkey_for_each_ptr(ptrs, ptr) {
-                               struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-                               if (!ca)
-                                       continue;
-
-                               u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
-                               if (gen_after(*gen, ptr->gen))
-                                       *gen = ptr->gen;
-                       }
-       }
-
-       if (too_stale) {
-               struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
-               int ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       return ret;
-
-               bch2_extent_normalize(c, bkey_i_to_s(u));
-       }
-
-       return 0;
-}
-
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
-                                      struct btree_iter *iter, struct bkey_s_c k)
-{
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-       struct bkey_i_alloc_v4 *a_mut;
-       int ret;
-
-       if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
-               return 0;
-
-       a_mut = bch2_alloc_to_v4_mut(trans, k);
-       ret = PTR_ERR_OR_ZERO(a_mut);
-       if (ret)
-               return ret;
-
-       a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
-
-       return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
-}
-
-int bch2_gc_gens(struct bch_fs *c)
-{
-       u64 b, start_time = local_clock();
-       int ret;
-
-       if (!mutex_trylock(&c->gc_gens_lock))
-               return 0;
-
-       trace_and_count(c, gc_gens_start, c);
-
-       /*
-        * We have to use trylock here. Otherwise, we would
-        * introduce a deadlock in the RO path - we take the
-        * state lock at the start of going RO.
-        */
-       if (!down_read_trylock(&c->state_lock)) {
-               mutex_unlock(&c->gc_gens_lock);
-               return 0;
-       }
-
-       for_each_member_device(c, ca) {
-               struct bucket_gens *gens = bucket_gens(ca);
-
-               BUG_ON(ca->oldest_gen);
-
-               ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
-               if (!ca->oldest_gen) {
-                       bch2_dev_put(ca);
-                       ret = bch_err_throw(c, ENOMEM_gc_gens);
-                       goto err;
-               }
-
-               for (b = gens->first_bucket;
-                    b < gens->nbuckets; b++)
-                       ca->oldest_gen[b] = gens->b[b];
-       }
-
-       for (unsigned i = 0; i < BTREE_ID_NR; i++)
-               if (btree_type_has_ptrs(i)) {
-                       c->gc_gens_btree = i;
-                       c->gc_gens_pos = POS_MIN;
-
-                       ret = bch2_trans_run(c,
-                               for_each_btree_key_commit(trans, iter, i,
-                                               POS_MIN,
-                                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
-                                               k,
-                                               NULL, NULL,
-                                               BCH_TRANS_COMMIT_no_enospc,
-                                       gc_btree_gens_key(trans, &iter, k)));
-                       if (ret)
-                               goto err;
-               }
-
-       struct bch_dev *ca = NULL;
-       ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
-                               POS_MIN,
-                               BTREE_ITER_prefetch,
-                               k,
-                               NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc, ({
-                       ca = bch2_dev_iterate(c, ca, k.k->p.inode);
-                       if (!ca) {
-                               bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
-                               continue;
-                       }
-                       bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
-               })));
-       bch2_dev_put(ca);
-
-       if (ret)
-               goto err;
-
-       c->gc_gens_btree        = 0;
-       c->gc_gens_pos          = POS_MIN;
-
-       c->gc_count++;
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-       trace_and_count(c, gc_gens_end, c);
-err:
-       for_each_member_device(c, ca) {
-               kvfree(ca->oldest_gen);
-               ca->oldest_gen = NULL;
-       }
-
-       up_read(&c->state_lock);
-       mutex_unlock(&c->gc_gens_lock);
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_fn(c, ret);
-       return ret;
-}
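
Both trylocks above exist to break a potential ABBA inversion: the going-RO path
takes state_lock first, so if bch2_gc_gens() blocked on state_lock while holding
gc_gens_lock, RO-side work that in turn needed gc_gens_lock could deadlock.
Returning 0 on contention and letting a later trigger retry is the cheap way
out. A generic userspace sketch of the same back-off pattern (lock names
hypothetical):

    #include <pthread.h>

    static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

    /* Other threads take A then B; we want B then A, so we must not block. */
    static int try_work(void)
    {
            pthread_mutex_lock(&lock_b);
            if (pthread_mutex_trylock(&lock_a) != 0) {
                    /* A is contended: back off instead of risking ABBA */
                    pthread_mutex_unlock(&lock_b);
                    return 0;       /* caller retries later */
            }
            /* ... work under both locks ... */
            pthread_mutex_unlock(&lock_a);
            pthread_mutex_unlock(&lock_b);
            return 1;
    }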
-
-static void bch2_gc_gens_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
-       bch2_gc_gens(c);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_gc_gens_async(struct bch_fs *c)
-{
-       if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
-           !queue_work(c->write_ref_wq, &c->gc_gens_work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
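
The tryget/queue/put dance in bch2_gc_gens_async() is a standard kernel pattern:
take a write reference before queueing so the work item can't outlive the
filesystem, and drop it again when queue_work() returns false - that means the
item was already pending, and the reference taken by the earlier queueing covers
the run. A rough sketch with a plain refcount (struct and field names
hypothetical):

    #include <linux/refcount.h>
    #include <linux/workqueue.h>

    struct my_fs {                          /* hypothetical */
            refcount_t              writes;
            struct workqueue_struct *wq;
            struct work_struct      work;
    };

    static void my_kick_async(struct my_fs *fs)
    {
            if (!refcount_inc_not_zero(&fs->writes))
                    return;                 /* shutting down: don't queue */

            /* false == already pending; that queueing holds its own ref */
            if (!queue_work(fs->wq, &fs->work))
                    refcount_dec(&fs->writes);
    }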
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *c)
-{
-       seqcount_init(&c->gc_pos_lock);
-       INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
-
-       init_rwsem(&c->gc_lock);
-       mutex_init(&c->gc_gens_lock);
-}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
deleted file mode 100644 (file)
index ec77662..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_H
-#define _BCACHEFS_BTREE_GC_H
-
-#include "bkey.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-
-int bch2_check_topology(struct bch_fs *);
-int bch2_check_allocations(struct bch_fs *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so the race resolves correctly).
- */
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
-       return (struct gc_pos) { .phase = phase, };
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
-                                        struct bpos pos)
-{
-       return (struct gc_pos) {
-               .phase  = GC_PHASE_btree,
-               .btree  = btree,
-               .level  = level,
-               .pos    = pos,
-       };
-}
-
-static inline int gc_btree_order(enum btree_id btree)
-{
-       if (btree == BTREE_ID_alloc)
-               return -2;
-       if (btree == BTREE_ID_stripes)
-               return -1;
-       return btree;
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
-       return  cmp_int(l.phase, r.phase) ?:
-               cmp_int(gc_btree_order(l.btree),
-                       gc_btree_order(r.btree)) ?:
-               cmp_int(l.level, r.level) ?:
-               bpos_cmp(l.pos, r.pos);
-}
-
-static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
-{
-       unsigned seq;
-       bool ret;
-
-       do {
-               seq = read_seqcount_begin(&c->gc_pos_lock);
-               ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
-       } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
-       return ret;
-}
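
gc_visited() is the reader half of a Linux seqcount: it rereads c->gc_pos until
the sequence number is stable across the comparison, getting a consistent
snapshot without taking a lock. The matching writer (gc_pos_set() and
__gc_pos_set() over in btree_gc.c) bumps the count around the store, roughly as
in this sketch - note that seqcount writers must already be serialized against
each other, which the GC code guarantees by other means:

    #include <linux/seqlock.h>

    /* Writer side: concurrent readers in read_seqcount_retry() redo their
     * read if this runs while they were looking. */
    static void set_gc_pos(struct bch_fs *c, struct gc_pos new_pos)
    {
            write_seqcount_begin(&c->gc_pos_lock);
            c->gc_pos = new_pos;
            write_seqcount_end(&c->gc_pos_lock);
    }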
-
-void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
-
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_gens_async(struct bch_fs *);
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
deleted file mode 100644 (file)
index c24dd6e..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_TYPES_H
-#define _BCACHEFS_BTREE_GC_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-#define GC_PHASES()            \
-       x(not_running)          \
-       x(start)                \
-       x(sb)                   \
-       x(btree)
-
-enum gc_phase {
-#define x(n)   GC_PHASE_##n,
-       GC_PHASES()
-#undef x
-};
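
GC_PHASES() is an x-macro: the phase list is written once, and each use site
defines x() to stamp out a different expansion. Above, x(n) becomes
GC_PHASE_##n, yielding GC_PHASE_not_running, GC_PHASE_start, GC_PHASE_sb and
GC_PHASE_btree. The same list could equally generate a name table, keeping the
enum and the strings from drifting apart (a sketch, not something this file
contained):

    static const char * const gc_phase_strs[] = {
    #define x(n)    #n,
            GC_PHASES()
    #undef x
            NULL
    };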
-
-struct gc_pos {
-       enum gc_phase           phase:8;
-       enum btree_id           btree:8;
-       u16                     level;
-       struct bpos             pos;
-};
-
-struct reflink_gc {
-       u64             offset;
-       u32             size;
-       u32             refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
-#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
deleted file mode 100644 (file)
index 590cd29..0000000
+++ /dev/null
@@ -1,2742 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "recovery.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
-{
-       bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
-       prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
-       prt_str(out, "min: ");
-       bch2_bpos_to_text(out, bn->min_key);
-       prt_newline(out);
-       prt_str(out, "max: ");
-       bch2_bpos_to_text(out, bn->max_key);
-}
-
-void bch2_btree_node_io_unlock(struct btree *b)
-{
-       EBUG_ON(!btree_node_write_in_flight(b));
-
-       clear_btree_node_write_in_flight_inner(b);
-       clear_btree_node_write_in_flight(b);
-       smp_mb__after_atomic();
-       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-void bch2_btree_node_io_lock(struct btree *b)
-{
-       wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
-                           TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_read(struct btree *b)
-{
-       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_write(struct btree *b)
-{
-       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_read(struct btree *b)
-{
-       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_write(struct btree *b)
-{
-       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-                      TASK_UNINTERRUPTIBLE);
-}
-
-static void verify_no_dups(struct btree *b,
-                          struct bkey_packed *start,
-                          struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct bkey_packed *k, *p;
-
-       if (start == end)
-               return;
-
-       for (p = start, k = bkey_p_next(start);
-            k != end;
-            p = k, k = bkey_p_next(k)) {
-               struct bkey l = bkey_unpack_key(b, p);
-               struct bkey r = bkey_unpack_key(b, k);
-
-               BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
-       }
-#endif
-}
-
-static void set_needs_whiteout(struct bset *i, int v)
-{
-       struct bkey_packed *k;
-
-       for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-               k->needs_whiteout = v;
-}
-
-static void btree_bounce_free(struct bch_fs *c, size_t size,
-                             bool used_mempool, void *p)
-{
-       if (used_mempool)
-               mempool_free(p, &c->btree_bounce_pool);
-       else
-               kvfree(p);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
-                               bool *used_mempool)
-{
-       unsigned flags = memalloc_nofs_save();
-       void *p;
-
-       BUG_ON(size > c->opts.btree_node_size);
-
-       *used_mempool = false;
-       p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
-       if (!p) {
-               *used_mempool = true;
-               p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
-       }
-       memalloc_nofs_restore(flags);
-       return p;
-}
-
-static void sort_bkey_ptrs(const struct btree *bt,
-                          struct bkey_packed **ptrs, unsigned nr)
-{
-       unsigned n = nr, a = nr / 2, b, c, d;
-
-       if (!a)
-               return;
-
-       /* Heap sort: see lib/sort.c: */
-       while (1) {
-               if (a)
-                       a--;
-               else if (--n)
-                       swap(ptrs[0], ptrs[n]);
-               else
-                       break;
-
-               for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
-                       b = bch2_bkey_cmp_packed(bt,
-                                           ptrs[c],
-                                           ptrs[d]) >= 0 ? c : d;
-               if (d == n)
-                       b = c;
-
-               while (b != a &&
-                      bch2_bkey_cmp_packed(bt,
-                                      ptrs[a],
-                                      ptrs[b]) >= 0)
-                       b = (b - 1) / 2;
-               c = b;
-               while (b != a) {
-                       b = (b - 1) / 2;
-                       swap(ptrs[b], ptrs[c]);
-               }
-       }
-}
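
sort_bkey_ptrs() sorts an array of pointers into the node rather than the keys
themselves - packed bkeys are variable-length, so they can't be swapped in
place - and the loop is an in-place heapsort written in the merged
heapify-then-pop style of lib/sort.c. For reference, the textbook two-phase
form of the same algorithm (a sketch over ints, not the original code):

    /* Restore the max-heap property below 'root', over v[0..n) */
    static void sift_down(int *v, unsigned root, unsigned n)
    {
            unsigned child;

            while ((child = 2 * root + 1) < n) {
                    if (child + 1 < n && v[child + 1] > v[child])
                            child++;                /* larger of the two */
                    if (v[root] >= v[child])
                            break;
                    int tmp = v[root]; v[root] = v[child]; v[child] = tmp;
                    root = child;
            }
    }

    static void heapsort_ints(int *v, unsigned n)
    {
            for (unsigned i = n / 2; i-- > 0;)      /* phase 1: heapify */
                    sift_down(v, i, n);
            while (n-- > 1) {                       /* phase 2: pop the max */
                    int tmp = v[0]; v[0] = v[n]; v[n] = tmp;
                    sift_down(v, 0, n);
            }
    }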
-
-static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
-{
-       struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
-       bool used_mempool = false;
-       size_t bytes = b->whiteout_u64s * sizeof(u64);
-
-       if (!b->whiteout_u64s)
-               return;
-
-       new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-
-       ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
-
-       for (k = unwritten_whiteouts_start(b);
-            k != unwritten_whiteouts_end(b);
-            k = bkey_p_next(k))
-               *--ptrs = k;
-
-       sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
-
-       k = new_whiteouts;
-
-       while (ptrs != ptrs_end) {
-               bkey_p_copy(k, *ptrs);
-               k = bkey_p_next(k);
-               ptrs++;
-       }
-
-       verify_no_dups(b, new_whiteouts,
-                      (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
-
-       memcpy_u64s(unwritten_whiteouts_start(b),
-                   new_whiteouts, b->whiteout_u64s);
-
-       btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
-}
-
-static bool should_compact_bset(struct btree *b, struct bset_tree *t,
-                               bool compacting, enum compact_mode mode)
-{
-       if (!bset_dead_u64s(b, t))
-               return false;
-
-       switch (mode) {
-       case COMPACT_LAZY:
-               return should_compact_bset_lazy(b, t) ||
-                       (compacting && !bset_written(b, bset(b, t)));
-       case COMPACT_ALL:
-               return true;
-       default:
-               BUG();
-       }
-}
-
-static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
-{
-       bool ret = false;
-
-       for_each_bset(b, t) {
-               struct bset *i = bset(b, t);
-               struct bkey_packed *k, *n, *out, *start, *end;
-               struct btree_node_entry *src = NULL, *dst = NULL;
-
-               if (t != b->set && !bset_written(b, i)) {
-                       src = container_of(i, struct btree_node_entry, keys);
-                       dst = max(write_block(b),
-                                 (void *) btree_bkey_last(b, t - 1));
-               }
-
-               if (src != dst)
-                       ret = true;
-
-               if (!should_compact_bset(b, t, ret, mode)) {
-                       if (src != dst) {
-                               memmove(dst, src, sizeof(*src) +
-                                       le16_to_cpu(src->keys.u64s) *
-                                       sizeof(u64));
-                               i = &dst->keys;
-                               set_btree_bset(b, t, i);
-                       }
-                       continue;
-               }
-
-               start   = btree_bkey_first(b, t);
-               end     = btree_bkey_last(b, t);
-
-               if (src != dst) {
-                       memmove(dst, src, sizeof(*src));
-                       i = &dst->keys;
-                       set_btree_bset(b, t, i);
-               }
-
-               out = i->start;
-
-               for (k = start; k != end; k = n) {
-                       n = bkey_p_next(k);
-
-                       if (!bkey_deleted(k)) {
-                               bkey_p_copy(out, k);
-                               out = bkey_p_next(out);
-                       } else {
-                               BUG_ON(k->needs_whiteout);
-                       }
-               }
-
-               i->u64s = cpu_to_le16((u64 *) out - i->_data);
-               set_btree_bset_end(b, t);
-               bch2_bset_set_no_aux_tree(b, t);
-               ret = true;
-       }
-
-       bch2_verify_btree_nr_keys(b);
-
-       bch2_btree_build_aux_trees(b);
-
-       return ret;
-}
-
-bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
-                           enum compact_mode mode)
-{
-       return bch2_drop_whiteouts(b, mode);
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
-                           unsigned start_idx,
-                           unsigned end_idx)
-{
-       struct btree_node *out;
-       struct sort_iter_stack sort_iter;
-       struct bset_tree *t;
-       struct bset *start_bset = bset(b, &b->set[start_idx]);
-       bool used_mempool = false;
-       u64 start_time, seq = 0;
-       unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
-       bool sorting_entire_node = start_idx == 0 &&
-               end_idx == b->nsets;
-
-       sort_iter_stack_init(&sort_iter, b);
-
-       for (t = b->set + start_idx;
-            t < b->set + end_idx;
-            t++) {
-               u64s += le16_to_cpu(bset(b, t)->u64s);
-               sort_iter_add(&sort_iter.iter,
-                             btree_bkey_first(b, t),
-                             btree_bkey_last(b, t));
-       }
-
-       bytes = sorting_entire_node
-               ? btree_buf_bytes(b)
-               : __vstruct_bytes(struct btree_node, u64s);
-
-       out = btree_bounce_alloc(c, bytes, &used_mempool);
-
-       start_time = local_clock();
-
-       u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
-
-       out->keys.u64s = cpu_to_le16(u64s);
-
-       BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
-
-       if (sorting_entire_node)
-               bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
-                                      start_time);
-
-       /* Make sure we preserve bset journal_seq: */
-       for (t = b->set + start_idx; t < b->set + end_idx; t++)
-               seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
-       start_bset->journal_seq = cpu_to_le64(seq);
-
-       if (sorting_entire_node) {
-               u64s = le16_to_cpu(out->keys.u64s);
-
-               BUG_ON(bytes != btree_buf_bytes(b));
-
-               /*
-                * Our temporary buffer is the same size as the btree node's
-                * buffer, so we can just swap buffers instead of doing a big
-                * memcpy()
-                */
-               *out = *b->data;
-               out->keys.u64s = cpu_to_le16(u64s);
-               swap(out, b->data);
-               set_btree_bset(b, b->set, &b->data->keys);
-       } else {
-               start_bset->u64s = out->keys.u64s;
-               memcpy_u64s(start_bset->start,
-                           out->keys.start,
-                           le16_to_cpu(out->keys.u64s));
-       }
-
-       for (i = start_idx + 1; i < end_idx; i++)
-               b->nr.bset_u64s[start_idx] +=
-                       b->nr.bset_u64s[i];
-
-       b->nsets -= shift;
-
-       for (i = start_idx + 1; i < b->nsets; i++) {
-               b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
-               b->set[i]               = b->set[i + shift];
-       }
-
-       for (i = b->nsets; i < MAX_BSETS; i++)
-               b->nr.bset_u64s[i] = 0;
-
-       set_btree_bset_end(b, &b->set[start_idx]);
-       bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
-       btree_bounce_free(c, bytes, used_mempool, out);
-
-       bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_btree_sort_into(struct bch_fs *c,
-                        struct btree *dst,
-                        struct btree *src)
-{
-       struct btree_nr_keys nr;
-       struct btree_node_iter src_iter;
-       u64 start_time = local_clock();
-
-       BUG_ON(dst->nsets != 1);
-
-       bch2_bset_set_no_aux_tree(dst, dst->set);
-
-       bch2_btree_node_iter_init_from_start(&src_iter, src);
-
-       nr = bch2_sort_repack(btree_bset_first(dst),
-                       src, &src_iter,
-                       &dst->format,
-                       true);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
-                              start_time);
-
-       set_btree_bset_end(dst, dst->set);
-
-       dst->nr.live_u64s       += nr.live_u64s;
-       dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
-       dst->nr.packed_keys     += nr.packed_keys;
-       dst->nr.unpacked_keys   += nr.unpacked_keys;
-
-       bch2_verify_btree_nr_keys(dst);
-}
-
-/*
- * We're about to add another bset to the btree node, so if there are currently
- * too many bsets, sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b)
-{
-       unsigned unwritten_idx;
-       bool ret = false;
-
-       for (unwritten_idx = 0;
-            unwritten_idx < b->nsets;
-            unwritten_idx++)
-               if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
-                       break;
-
-       if (b->nsets - unwritten_idx > 1) {
-               btree_node_sort(c, b, unwritten_idx, b->nsets);
-               ret = true;
-       }
-
-       if (unwritten_idx > 1) {
-               btree_node_sort(c, b, 0, unwritten_idx);
-               ret = true;
-       }
-
-       return ret;
-}
-
-void bch2_btree_build_aux_trees(struct btree *b)
-{
-       for_each_bset(b, t)
-               bch2_bset_build_aux_tree(b, t,
-                               !bset_written(b, bset(b, t)) &&
-                               t == bset_tree_last(b));
-}
-
-/*
- * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
-       unsigned mid_u64s_bits =
-               (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
-       return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
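
The threshold works in log space: since

    log2(sqrt(x * y)) = (log2(x) + log2(y)) / 2

averaging the two exponents gives the log of the geometric mean without ever
forming the product. As a worked example with illustrative values: if the first
bset is bounded by 2^16 u64s and BTREE_WRITE_SET_U64s_BITS is 9, then
mid_u64s_bits = (16 + 9) / 2 = 12 (integer division), and a full compaction
triggers once the middle bset exceeds 2^12 = 4096 u64s.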
-
-/*
- * bch2_btree_init_next - initialize a new (unwritten) bset that keys can then
- * be inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * May sort existing bsets (invalidating iterators); if so, iterators are
- * reinitialized via bch2_trans_node_reinit_iter().
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_node_entry *bne;
-       bool reinit_iter = false;
-
-       EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
-       BUG_ON(bset_written(b, bset(b, &b->set[1])));
-       BUG_ON(btree_node_just_written(b));
-
-       if (b->nsets == MAX_BSETS &&
-           !btree_node_write_in_flight(b) &&
-           should_compact_all(c, b)) {
-               bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
-                                           BTREE_WRITE_init_next_bset);
-               reinit_iter = true;
-       }
-
-       if (b->nsets == MAX_BSETS &&
-           btree_node_compact(c, b))
-               reinit_iter = true;
-
-       BUG_ON(b->nsets >= MAX_BSETS);
-
-       bne = want_new_bset(c, b);
-       if (bne)
-               bch2_bset_init_next(b, bne);
-
-       bch2_btree_build_aux_trees(b);
-
-       if (reinit_iter)
-               bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
-                         struct bch_dev *ca,
-                         bool print_pos,
-                         struct btree *b, struct bset *i, struct bkey_packed *k,
-                         unsigned offset, int rw)
-{
-       if (print_pos) {
-               prt_str(out, rw == READ
-                       ? "error validating btree node "
-                       : "corrupt btree node before write ");
-               prt_printf(out, "at btree ");
-               bch2_btree_pos_to_text(out, c, b);
-               prt_newline(out);
-       }
-
-       if (ca)
-               prt_printf(out, "%s ", ca->name);
-
-       prt_printf(out, "node offset %u/%u",
-                  b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
-       if (i)
-               prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
-       if (k)
-               prt_printf(out, " bset byte offset %lu",
-                          (unsigned long)(void *)k -
-                          ((unsigned long)(void *)i & ~511UL));
-       prt_str(out, ": ");
-}
-
-__printf(11, 12)
-static int __btree_err(int ret,
-                      struct bch_fs *c,
-                      struct bch_dev *ca,
-                      struct btree *b,
-                      struct bset *i,
-                      struct bkey_packed *k,
-                      int rw,
-                      enum bch_sb_error_id err_type,
-                      struct bch_io_failures *failed,
-                      struct printbuf *err_msg,
-                      const char *fmt, ...)
-{
-       if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
-               return ret == -BCH_ERR_btree_node_read_err_fixable
-                       ? bch_err_throw(c, fsck_fix)
-                       : ret;
-
-       bool have_retry = false;
-       int ret2;
-
-       if (ca) {
-               bch2_mark_btree_validate_failure(failed, ca->dev_idx);
-
-               struct extent_ptr_decoded pick;
-               have_retry = bch2_bkey_pick_read_device(c,
-                                       bkey_i_to_s_c(&b->key),
-                                       failed, &pick, -1) == 1;
-       }
-
-       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
-               ret = bch_err_throw(c, btree_node_read_err_fixable);
-       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
-               ret = bch_err_throw(c, btree_node_read_err_bad_node);
-
-       bch2_sb_error_count(c, err_type);
-
-       bool print_deferred = err_msg &&
-               rw == READ &&
-               !(test_bit(BCH_FS_in_fsck, &c->flags) &&
-                 c->opts.fix_errors == FSCK_FIX_ask);
-
-       struct printbuf out = PRINTBUF;
-       bch2_log_msg_start(c, &out);
-
-       if (!print_deferred)
-               err_msg = &out;
-
-       btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
-
-       va_list args;
-       va_start(args, fmt);
-       prt_vprintf(err_msg, fmt, args);
-       va_end(args);
-
-       if (print_deferred) {
-               prt_newline(err_msg);
-
-               switch (ret) {
-               case -BCH_ERR_btree_node_read_err_fixable:
-                       ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
-                       if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
-                           !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
-                               ret = ret2;
-                               goto fsck_err;
-                       }
-
-                       if (!have_retry)
-                               ret = bch_err_throw(c, fsck_fix);
-                       goto out;
-               case -BCH_ERR_btree_node_read_err_bad_node:
-                       prt_str(&out, ", ");
-                       break;
-               }
-
-               goto out;
-       }
-
-       if (rw == WRITE) {
-               prt_str(&out, ", ");
-               ret = __bch2_inconsistent_error(c, &out)
-                       ? -BCH_ERR_fsck_errors_not_fixed
-                       : 0;
-               goto print;
-       }
-
-       switch (ret) {
-       case -BCH_ERR_btree_node_read_err_fixable:
-               ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
-               if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
-                   !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
-                       ret = ret2;
-                       goto fsck_err;
-               }
-
-               if (!have_retry)
-                       ret = bch_err_throw(c, fsck_fix);
-               goto out;
-       case -BCH_ERR_btree_node_read_err_bad_node:
-               prt_str(&out, ", ");
-               break;
-       }
-print:
-       bch2_print_str(c, KERN_ERR, out.buf);
-out:
-fsck_err:
-       printbuf_exit(&out);
-       return ret;
-}
-
-#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...)           \
-({                                                                     \
-       int _ret = __btree_err(type, c, ca, b, i, k, write,             \
-                              BCH_FSCK_ERR_##_err_type,                \
-                              failed, err_msg,                         \
-                              msg, ##__VA_ARGS__);                     \
-                                                                       \
-       if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) {                \
-               ret = _ret;                                             \
-               goto fsck_err;                                          \
-       }                                                               \
-                                                                       \
-       true;                                                           \
-})
-
-#define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
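
btree_err() is a GCC statement expression: the ({ ... }) block can run full
statements - including the goto taken on unfixable errors - and then yields its
last expression as the macro's value, which is what lets btree_err_on() embed it
inside a conditional expression. A tiny standalone sketch of the construct
(macro name hypothetical):

    /* ({ ... }) evaluates to its final expression; earlier statements run
     * first, and __v is scoped to the block so (x) is evaluated only once. */
    #define clamp_byte(x) ({                        \
            int __v = (x);                          \
            __v < 0 ? 0 : __v > 255 ? 255 : __v;    \
    })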
-
-/*
- * When btree topology repair changes the start or end of a node, that might
- * mean we have to drop keys that are no longer inside the node:
- */
-__cold
-void bch2_btree_node_drop_keys_outside_node(struct btree *b)
-{
-       for_each_bset(b, t) {
-               struct bset *i = bset(b, t);
-               struct bkey_packed *k;
-
-               for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-                       if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
-                               break;
-
-               if (k != i->start) {
-                       unsigned shift = (u64 *) k - (u64 *) i->start;
-
-                       memmove_u64s_down(i->start, k,
-                                         (u64 *) vstruct_end(i) - (u64 *) k);
-                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
-                       set_btree_bset_end(b, t);
-               }
-
-               for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
-                       if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
-                               break;
-
-               if (k != vstruct_last(i)) {
-                       i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
-                       set_btree_bset_end(b, t);
-               }
-       }
-
-       /*
-        * Always rebuild search trees: eytzinger search tree nodes directly
-        * depend on the values of min/max key:
-        */
-       bch2_bset_set_no_aux_tree(b, b->set);
-       bch2_btree_build_aux_trees(b);
-       b->nr = bch2_btree_node_count_keys(b);
-
-       struct bkey_s_c k;
-       struct bkey unpacked;
-       struct btree_node_iter iter;
-       for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
-               BUG_ON(bpos_lt(k.k->p, b->data->min_key));
-               BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-       }
-}
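
The aux search trees rebuilt here are laid out in eytzinger (breadth-first)
array order, so every tree node's position is a pure function of the key layout
and the node's min/max; once those move, nothing can be patched incrementally
and the structure must be regenerated. A sketch of the 1-based indexing scheme
(bcachefs has the kernel version in eytzinger.h; the ffs step below follows the
standard branch-free lookup and is illustrative):

    /* 1-based eytzinger layout: children of node i sit at 2i and 2i + 1 */
    static unsigned eyt1_search(const int *tree, unsigned n, int key)
    {
            unsigned i = 1;

            while (i <= n)
                    i = 2 * i + (tree[i] < key);    /* right if tree[i] < key */

            /* strip the trailing right turns plus the final left turn: */
            i >>= __builtin_ffs(~i);
            return i;       /* index of first element >= key; 0 if none */
    }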
-
-static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
-                        struct btree *b, struct bset *i,
-                        unsigned offset, int write,
-                        struct bch_io_failures *failed,
-                        struct printbuf *err_msg)
-{
-       unsigned version = le16_to_cpu(i->version);
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-       int ret = 0;
-
-       btree_err_on(!bch2_version_compatible(version),
-                    -BCH_ERR_btree_node_read_err_incompatible,
-                    c, ca, b, i, NULL,
-                    btree_node_unsupported_version,
-                    "unsupported bset version %u.%u",
-                    BCH_VERSION_MAJOR(version),
-                    BCH_VERSION_MINOR(version));
-
-       if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
-           btree_err_on(version < c->sb.version_min,
-                        -BCH_ERR_btree_node_read_err_fixable,
-                        c, NULL, b, i, NULL,
-                        btree_node_bset_older_than_sb_min,
-                        "bset version %u older than superblock version_min %u",
-                        version, c->sb.version_min)) {
-               if (bch2_version_compatible(version)) {
-                       mutex_lock(&c->sb_lock);
-                       c->disk_sb.sb->version_min = cpu_to_le16(version);
-                       bch2_write_super(c);
-                       mutex_unlock(&c->sb_lock);
-               } else {
-                       /* Version is incomprehensible: overwrite it: */
-                       i->version = cpu_to_le16(c->sb.version);
-               }
-       }
-
-       if (btree_err_on(BCH_VERSION_MAJOR(version) >
-                        BCH_VERSION_MAJOR(c->sb.version),
-                        -BCH_ERR_btree_node_read_err_fixable,
-                        c, NULL, b, i, NULL,
-                        btree_node_bset_newer_than_sb,
-                        "bset version %u newer than superblock version %u",
-                        version, c->sb.version)) {
-               mutex_lock(&c->sb_lock);
-               c->disk_sb.sb->version = cpu_to_le16(version);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-       }
-
-       btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-                    -BCH_ERR_btree_node_read_err_incompatible,
-                    c, ca, b, i, NULL,
-                    btree_node_unsupported_version,
-                    "BSET_SEPARATE_WHITEOUTS no longer supported");
-
-       btree_err_on(offset && !i->u64s,
-                    -BCH_ERR_btree_node_read_err_fixable,
-                    c, ca, b, i, NULL,
-                    bset_empty,
-                    "empty bset");
-
-       btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-                    -BCH_ERR_btree_node_read_err_want_retry,
-                    c, ca, b, i, NULL,
-                    bset_wrong_sector_offset,
-                    "bset at wrong sector offset");
-
-       if (!offset) {
-               struct btree_node *bn =
-                       container_of(i, struct btree_node, keys);
-               /* These indicate that we read the wrong btree node: */
-
-               if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-                       struct bch_btree_ptr_v2 *bp =
-                               &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-                       /* XXX endianness */
-                       btree_err_on(bp->seq != bn->keys.seq,
-                                    -BCH_ERR_btree_node_read_err_must_retry,
-                                    c, ca, b, NULL, NULL,
-                                    bset_bad_seq,
-                                    "incorrect sequence number (wrong btree node)");
-               }
-
-               btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-                            -BCH_ERR_btree_node_read_err_must_retry,
-                            c, ca, b, i, NULL,
-                            btree_node_bad_btree,
-                            "incorrect btree id");
-
-               btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-                            -BCH_ERR_btree_node_read_err_must_retry,
-                            c, ca, b, i, NULL,
-                            btree_node_bad_level,
-                            "incorrect level");
-
-               if (!write)
-                       compat_btree_node(b->c.level, b->c.btree_id, version,
-                                         BSET_BIG_ENDIAN(i), write, bn);
-
-               if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-                       struct bch_btree_ptr_v2 *bp =
-                               &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-                       if (BTREE_PTR_RANGE_UPDATED(bp)) {
-                               b->data->min_key = bp->min_key;
-                               b->data->max_key = b->key.k.p;
-                       }
-
-                       btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-                                    -BCH_ERR_btree_node_read_err_must_retry,
-                                    c, ca, b, NULL, NULL,
-                                    btree_node_bad_min_key,
-                                    "incorrect min_key: got %s should be %s",
-                                    (printbuf_reset(&buf1),
-                                     bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
-                                    (printbuf_reset(&buf2),
-                                     bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
-               }
-
-               btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-                            -BCH_ERR_btree_node_read_err_must_retry,
-                            c, ca, b, i, NULL,
-                            btree_node_bad_max_key,
-                            "incorrect max key %s",
-                            (printbuf_reset(&buf1),
-                             bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
-
-               if (write)
-                       compat_btree_node(b->c.level, b->c.btree_id, version,
-                                         BSET_BIG_ENDIAN(i), write, bn);
-
-               btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-                            -BCH_ERR_btree_node_read_err_bad_node,
-                            c, ca, b, i, NULL,
-                            btree_node_bad_format,
-                            "invalid bkey format: %s\n%s", buf1.buf,
-                            (printbuf_reset(&buf2),
-                             bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
-               printbuf_reset(&buf1);
-
-               compat_bformat(b->c.level, b->c.btree_id, version,
-                              BSET_BIG_ENDIAN(i), write,
-                              &bn->format);
-       }
-fsck_err:
-       printbuf_exit(&buf2);
-       printbuf_exit(&buf1);
-       return ret;
-}
-
-static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
-                                       struct bkey_s_c k,
-                                       enum bch_validate_flags flags)
-{
-       return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
-               .from   = BKEY_VALIDATE_btree_node,
-               .level  = b->c.level,
-               .btree  = b->c.btree_id,
-               .flags  = flags
-       });
-}
-
-static int bset_key_validate(struct bch_fs *c, struct btree *b,
-                            struct bkey_s_c k,
-                            bool updated_range,
-                            enum bch_validate_flags flags)
-{
-       struct bkey_validate_context from = (struct bkey_validate_context) {
-               .from   = BKEY_VALIDATE_btree_node,
-               .level  = b->c.level,
-               .btree  = b->c.btree_id,
-               .flags  = flags,
-       };
-       return __bch2_bkey_validate(c, k, from) ?:
-               (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
-               (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
-}
-
-static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
-                        struct bset *i, struct bkey_packed *k)
-{
-       if (bkey_p_next(k) > vstruct_last(i))
-               return false;
-
-       if (k->format > KEY_FORMAT_CURRENT)
-               return false;
-
-       if (!bkeyp_u64s_valid(&b->format, k))
-               return false;
-
-       struct bkey tmp;
-       struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-       return !__bch2_bkey_validate(c, u.s_c,
-                                    (struct bkey_validate_context) {
-                                       .from   = BKEY_VALIDATE_btree_node,
-                                       .level  = b->c.level,
-                                       .btree  = b->c.btree_id,
-                                       .flags  = BCH_VALIDATE_silent
-                                    });
-}
-
-static inline int btree_node_read_bkey_cmp(const struct btree *b,
-                               const struct bkey_packed *l,
-                               const struct bkey_packed *r)
-{
-       return bch2_bkey_cmp_packed(b, l, r)
-               ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
-}
-
-static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-                        struct bset *i, int write,
-                        struct bch_io_failures *failed,
-                        struct printbuf *err_msg)
-{
-       unsigned version = le16_to_cpu(i->version);
-       struct bkey_packed *k, *prev = NULL;
-       struct printbuf buf = PRINTBUF;
-       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
-       int ret = 0;
-
-       for (k = i->start;
-            k != vstruct_last(i);) {
-               struct bkey_s u;
-               struct bkey tmp;
-               unsigned next_good_key;
-
-               if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-                                -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, i, k,
-                                btree_node_bkey_past_bset_end,
-                                "key extends past end of bset")) {
-                       i->u64s = cpu_to_le16((u64 *) k - i->_data);
-                       break;
-               }
-
-               if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-                                -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, i, k,
-                                btree_node_bkey_bad_format,
-                                "invalid bkey format %u", k->format))
-                       goto drop_this_key;
-
-               if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
-                                -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, i, k,
-                                btree_node_bkey_bad_u64s,
-                                "bad k->u64s %u (min %u max %zu)", k->u64s,
-                                bkeyp_key_u64s(&b->format, k),
-                                U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
-                       goto drop_this_key;
-
-               if (!write)
-                       bch2_bkey_compat(b->c.level, b->c.btree_id, version,
-                                   BSET_BIG_ENDIAN(i), write,
-                                   &b->format, k);
-
-               u = __bkey_disassemble(b, k, &tmp);
-
-               ret = bset_key_validate(c, b, u.s_c, updated_range, write);
-               if (ret == -BCH_ERR_fsck_delete_bkey)
-                       goto drop_this_key;
-               if (ret)
-                       goto fsck_err;
-
-               if (write)
-                       bch2_bkey_compat(b->c.level, b->c.btree_id, version,
-                                   BSET_BIG_ENDIAN(i), write,
-                                   &b->format, k);
-
-               if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
-                       struct bkey up = bkey_unpack_key(b, prev);
-
-                       printbuf_reset(&buf);
-                       prt_printf(&buf, "keys out of order: ");
-                       bch2_bkey_to_text(&buf, &up);
-                       prt_printf(&buf, " > ");
-                       bch2_bkey_to_text(&buf, u.k);
-
-                       if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
-                                     c, NULL, b, i, k,
-                                     btree_node_bkey_out_of_order,
-                                     "%s", buf.buf))
-                               goto drop_this_key;
-               }
-
-               prev = k;
-               k = bkey_p_next(k);
-               continue;
-drop_this_key:
-               next_good_key = k->u64s;
-
-               if (!next_good_key ||
-                   (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
-                    version >= bcachefs_metadata_version_snapshot)) {
-                       /*
-                        * only do scanning if bch2_bkey_compat() has nothing to
-                        * do
-                        */
-
-                       if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
-                               for (next_good_key = 1;
-                                    next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
-                                    next_good_key++)
-                                       if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
-                                               goto got_good_key;
-                       }
-
-                       /*
-                        * didn't find a good key, have to truncate the rest of
-                        * the bset
-                        */
-                       next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
-               }
-got_good_key:
-               le16_add_cpu(&i->u64s, -next_good_key);
-               memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
-               set_btree_node_need_rewrite(b);
-               set_btree_node_need_rewrite_error(b);
-       }
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
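
The drop_this_key path is forward resynchronization: once a key header is
corrupt its length field can't be trusted, so validate_bset_keys() probes each
successive u64 boundary with bkey_packed_valid() until something parses as a
key, then shifts the remaining data down over the garbage (or truncates the
bset if nothing parses). The scan reduced to its skeleton (a sketch;
looks_like_key() stands in for bkey_packed_valid()):

    #include <stdbool.h>
    #include <stdint.h>

    bool looks_like_key(const uint64_t *p); /* stand-in for bkey_packed_valid() */

    /* Returns how many u64s to drop before the next parseable key. */
    static unsigned resync(const uint64_t *buf, unsigned pos, unsigned end)
    {
            for (unsigned off = 1; pos + off < end; off++)
                    if (looks_like_key(buf + pos + off))
                            return off;
            return end - pos;       /* nothing valid left: truncate */
    }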
-
-int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-                             struct btree *b,
-                             struct bch_io_failures *failed,
-                             struct printbuf *err_msg)
-{
-       struct btree_node_entry *bne;
-       struct sort_iter *iter;
-       struct btree_node *sorted;
-       struct bkey_packed *k;
-       struct bset *i;
-       bool used_mempool, blacklisted;
-       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
-       unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
-       u64 max_journal_seq = 0;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0, write = READ;
-       u64 start_time = local_clock();
-
-       b->version_ondisk = U16_MAX;
-       /* We might get called multiple times on read retry: */
-       b->written = 0;
-
-       iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
-       sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
-
-       if (bch2_meta_read_fault("btree"))
-               btree_err(-BCH_ERR_btree_node_read_err_must_retry,
-                         c, ca, b, NULL, NULL,
-                         btree_node_fault_injected,
-                         "dynamic fault");
-
-       btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-                    -BCH_ERR_btree_node_read_err_must_retry,
-                    c, ca, b, NULL, NULL,
-                    btree_node_bad_magic,
-                    "bad magic: want %llx, got %llx",
-                    bset_magic(c), le64_to_cpu(b->data->magic));
-
-       if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-               struct bch_btree_ptr_v2 *bp =
-                       &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
-               bch2_bpos_to_text(&buf, b->data->min_key);
-               prt_str(&buf, "-");
-               bch2_bpos_to_text(&buf, b->data->max_key);
-
-               btree_err_on(b->data->keys.seq != bp->seq,
-                            -BCH_ERR_btree_node_read_err_must_retry,
-                            c, ca, b, NULL, NULL,
-                            btree_node_bad_seq,
-                            "got wrong btree node: got\n%s",
-                            (printbuf_reset(&buf),
-                             bch2_btree_node_header_to_text(&buf, b->data),
-                             buf.buf));
-       } else {
-               btree_err_on(!b->data->keys.seq,
-                            -BCH_ERR_btree_node_read_err_must_retry,
-                            c, ca, b, NULL, NULL,
-                            btree_node_bad_seq,
-                            "bad btree header: seq 0\n%s",
-                            (printbuf_reset(&buf),
-                             bch2_btree_node_header_to_text(&buf, b->data),
-                             buf.buf));
-       }
-
-       while (b->written < (ptr_written ?: btree_sectors(c))) {
-               unsigned sectors;
-               bool first = !b->written;
-
-               if (first) {
-                       bne = NULL;
-                       i = &b->data->keys;
-               } else {
-                       bne = write_block(b);
-                       i = &bne->keys;
-
-                       if (i->seq != b->data->keys.seq)
-                               break;
-               }
-
-               struct nonce nonce = btree_nonce(i, b->written << 9);
-               bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
-               btree_err_on(!good_csum_type,
-                            bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
-                            ? -BCH_ERR_btree_node_read_err_must_retry
-                            : -BCH_ERR_btree_node_read_err_want_retry,
-                            c, ca, b, i, NULL,
-                            bset_unknown_csum,
-                            "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
-               if (first) {
-                       sectors = vstruct_sectors(b->data, c->block_bits);
-                       if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
-                                        -BCH_ERR_btree_node_read_err_fixable,
-                                        c, ca, b, i, NULL,
-                                        bset_past_end_of_btree_node,
-                                        "bset past end of btree node (offset %u len %u but written %zu)",
-                                        b->written, sectors, ptr_written ?: btree_sectors(c)))
-                               i->u64s = 0;
-                       if (good_csum_type) {
-                               struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
-                               bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
-                               if (csum_bad)
-                                       bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
-                               btree_err_on(csum_bad,
-                                            -BCH_ERR_btree_node_read_err_want_retry,
-                                            c, ca, b, i, NULL,
-                                            bset_bad_csum,
-                                            "%s",
-                                            (printbuf_reset(&buf),
-                                             bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
-                                             buf.buf));
-
-                               ret = bset_encrypt(c, i, b->written << 9);
-                               if (bch2_fs_fatal_err_on(ret, c,
-                                                        "decrypting btree node: %s", bch2_err_str(ret)))
-                                       goto fsck_err;
-                       }
-
-                       btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
-                                    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-                                    -BCH_ERR_btree_node_read_err_incompatible,
-                                    c, NULL, b, NULL, NULL,
-                                    btree_node_unsupported_version,
-                                    "btree node does not have NEW_EXTENT_OVERWRITE set");
-               } else {
-                       sectors = vstruct_sectors(bne, c->block_bits);
-                       if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
-                                        -BCH_ERR_btree_node_read_err_fixable,
-                                        c, ca, b, i, NULL,
-                                        bset_past_end_of_btree_node,
-                                        "bset past end of btree node (offset %u len %u but written %zu)",
-                                        b->written, sectors, ptr_written ?: btree_sectors(c)))
-                               i->u64s = 0;
-                       if (good_csum_type) {
-                               struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-                               bool csum_bad = bch2_crc_cmp(bne->csum, csum);
-                               if (ca && csum_bad)
-                                       bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
-                               btree_err_on(csum_bad,
-                                            -BCH_ERR_btree_node_read_err_want_retry,
-                                            c, ca, b, i, NULL,
-                                            bset_bad_csum,
-                                            "%s",
-                                            (printbuf_reset(&buf),
-                                             bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
-                                             buf.buf));
-
-                               ret = bset_encrypt(c, i, b->written << 9);
-                               if (bch2_fs_fatal_err_on(ret, c,
-                                               "decrypting btree node: %s", bch2_err_str(ret)))
-                                       goto fsck_err;
-                       }
-               }
-
-               b->version_ondisk = min(b->version_ondisk,
-                                       le16_to_cpu(i->version));
-
-               ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
-               if (ret)
-                       goto fsck_err;
-
-               if (!b->written)
-                       btree_node_set_format(b, b->data->format);
-
-               ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
-               if (ret)
-                       goto fsck_err;
-
-               SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-               blacklisted = bch2_journal_seq_is_blacklisted(c,
-                                       le64_to_cpu(i->journal_seq),
-                                       true);
-
-               btree_err_on(blacklisted && first,
-                            -BCH_ERR_btree_node_read_err_fixable,
-                            c, ca, b, i, NULL,
-                            bset_blacklisted_journal_seq,
-                            "first btree node bset has blacklisted journal seq (%llu)",
-                            le64_to_cpu(i->journal_seq));
-
-               btree_err_on(blacklisted && ptr_written,
-                            -BCH_ERR_btree_node_read_err_fixable,
-                            c, ca, b, i, NULL,
-                            first_bset_blacklisted_journal_seq,
-                            "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
-                            le64_to_cpu(i->journal_seq),
-                            b->written, b->written + sectors, ptr_written);
-
-               b->written = min(b->written + sectors, btree_sectors(c));
-
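-               /*
-                * A blacklisted journal seq means this bset's journal entry
-                * never committed: don't merge its keys (the first bset was
-                * already flagged as a fixable error above):
-                */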
-               if (blacklisted && !first)
-                       continue;
-
-               sort_iter_add(iter,
-                             vstruct_idx(i, 0),
-                             vstruct_last(i));
-
-               max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
-       }
-
-       if (ptr_written) {
-               btree_err_on(b->written < ptr_written,
-                            -BCH_ERR_btree_node_read_err_want_retry,
-                            c, ca, b, NULL, NULL,
-                            btree_node_data_missing,
-                            "btree node data missing: expected %u sectors, found %u",
-                            ptr_written, b->written);
-       } else {
-               for (bne = write_block(b);
-                    bset_byte_offset(b, bne) < btree_buf_bytes(b);
-                    bne = (void *) bne + block_bytes(c))
-                       btree_err_on(bne->keys.seq == b->data->keys.seq &&
-                                    !bch2_journal_seq_is_blacklisted(c,
-                                                                     le64_to_cpu(bne->keys.journal_seq),
-                                                                     true),
-                                    -BCH_ERR_btree_node_read_err_want_retry,
-                                    c, ca, b, NULL, NULL,
-                                    btree_node_bset_after_end,
-                                    "found bset signature after last bset");
-       }
-
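-       /*
-        * Merge sort all the bsets we read into a bounce buffer, zero the
-        * unused tail, then swap buffers so that b->data contains a single
-        * sorted bset:
-        */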
-       sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
-       sorted->keys.u64s = 0;
-
-       b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
-       memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
-                       btree_buf_bytes(b) -
-                       sizeof(struct btree_node) -
-                       b->nr.live_u64s * sizeof(u64));
-
-       b->data->keys.u64s = sorted->keys.u64s;
-       *sorted = *b->data;
-       swap(sorted, b->data);
-       set_btree_bset(b, b->set, &b->data->keys);
-       b->nsets = 1;
-       b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
-
-       BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
-
-       btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
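-       /*
-        * Now validate each key's value; invalid keys are dropped in place by
-        * shifting the rest of the bset down over them:
-        */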
-       i = &b->data->keys;
-       for (k = i->start; k != vstruct_last(i);) {
-               struct bkey tmp;
-               struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
-               ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
-               if (ret == -BCH_ERR_fsck_delete_bkey ||
-                   (static_branch_unlikely(&bch2_inject_invalid_keys) &&
-                    !bversion_cmp(u.k->bversion, MAX_VERSION))) {
-                       btree_keys_account_key_drop(&b->nr, 0, k);
-
-                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
-                       memmove_u64s_down(k, bkey_p_next(k),
-                                         (u64 *) vstruct_end(i) - (u64 *) k);
-                       set_btree_bset_end(b, b->set);
-                       set_btree_node_need_rewrite(b);
-                       set_btree_node_need_rewrite_error(b);
-                       continue;
-               }
-               if (ret)
-                       goto fsck_err;
-
-               if (u.k->type == KEY_TYPE_btree_ptr_v2) {
-                       struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
-                       bp.v->mem_ptr = 0;
-               }
-
-               k = bkey_p_next(k);
-       }
-
-       bch2_bset_build_aux_tree(b, b->set, false);
-
-       set_needs_whiteout(btree_bset_first(b), true);
-
-       btree_node_reset_sib_u64s(b);
-
-       if (updated_range)
-               bch2_btree_node_drop_keys_outside_node(b);
-
-       /*
-        * XXX:
-        *
-        * We deadlock if too many btree updates require node rewrites while
-        * we're still in journal replay.
-        *
-        * This is because btree node rewrites generate more updates for the
-        * interior updates (alloc, backpointers), and if those updates touch
-        * new nodes and generate more rewrites - well, you see the problem.
-        *
-        * The biggest cause is that we don't use the btree write buffer (for
-        * the backpointer updates) - this needs some real thought on locking
-        * in order to fix.
-        *
-        * The problem with this workaround (not doing the rewrite for degraded
-        * nodes in journal replay) is that those degraded nodes persist, and we
-        * don't want that (this is a real bug when a btree node write completes
-        * with fewer replicas than we wanted and leaves a degraded node due to
-        * device _removal_, i.e. the device went away mid write).
-        *
-        * It's less of a bug here, but still a problem because we don't yet
-        * have a way of tracking degraded data - we need another index (all
-        * extents/btree nodes, by replicas entry) in order to fix this properly
-        * (re-replicate degraded data at the earliest possible time).
-        */
-       if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
-               scoped_guard(rcu)
-                       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-                               struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-                               if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-                                       set_btree_node_need_rewrite(b);
-                                       set_btree_node_need_rewrite_degraded(b);
-                               }
-                       }
-       }
-
-       if (!ptr_written) {
-               set_btree_node_need_rewrite(b);
-               set_btree_node_need_rewrite_ptr_written_zero(b);
-       }
-fsck_err:
-       mempool_free(iter, &c->fill_iter);
-       printbuf_exit(&buf);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
-       return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
-       struct btree_read_bio *rb =
-               container_of(work, struct btree_read_bio, work);
-       struct bch_fs *c        = rb->c;
-       struct bch_dev *ca      = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-       struct btree *b         = rb->b;
-       struct bio *bio         = &rb->bio;
-       struct bch_io_failures failed = { .nr = 0 };
-       int ret = 0;
-
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       prt_printf(&buf, "btree node read error at btree ");
-       bch2_btree_pos_to_text(&buf, c, b);
-       prt_newline(&buf);
-
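-       /*
-        * The initial read was already submitted by the caller: jump into the
-        * middle of the retry loop to process it, then loop back to try other
-        * replicas on failure:
-        */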
-       goto start;
-       while (1) {
-               ret = bch2_bkey_pick_read_device(c,
-                                       bkey_i_to_s_c(&b->key),
-                                       &failed, &rb->pick, -1);
-               if (ret <= 0) {
-                       set_btree_node_read_error(b);
-                       break;
-               }
-
-               ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-               rb->have_ioref          = ca != NULL;
-               rb->start_time          = local_clock();
-               bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
-               bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
-               bio->bi_iter.bi_size    = btree_buf_bytes(b);
-
-               if (rb->have_ioref) {
-                       bio_set_dev(bio, ca->disk_sb.bdev);
-                       submit_bio_wait(bio);
-               } else {
-                       bio->bi_status = BLK_STS_REMOVED;
-               }
-
-               bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-                                          rb->start_time, !bio->bi_status);
-start:
-               if (rb->have_ioref)
-                       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
-               rb->have_ioref = false;
-
-               if (bio->bi_status) {
-                       bch2_mark_io_failure(&failed, &rb->pick, false);
-                       continue;
-               }
-
-               ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
-               if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-                   ret == -BCH_ERR_btree_node_read_err_must_retry)
-                       continue;
-
-               if (ret)
-                       set_btree_node_read_error(b);
-
-               break;
-       }
-
-       bch2_io_failures_to_text(&buf, c, &failed);
-
-       if (btree_node_read_error(b))
-               bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
-       /*
-        * only print retry success if we read from a replica with no errors
-        */
-       if (btree_node_read_error(b))
-               prt_printf(&buf, "ret %s", bch2_err_str(ret));
-       else if (failed.nr) {
-               if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
-                       prt_printf(&buf, "retry success");
-               else
-                       prt_printf(&buf, "repair success");
-       }
-
-       if ((failed.nr ||
-            btree_node_need_rewrite(b)) &&
-           !btree_node_read_error(b) &&
-           c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
-               prt_printf(&buf, " (rewriting node)");
-               bch2_btree_node_rewrite_async(c, b);
-       }
-       prt_newline(&buf);
-
-       if (failed.nr)
-               bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
-       async_object_list_del(c, btree_read_bio, rb->list_idx);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
-                              rb->start_time);
-       bio_put(&rb->bio);
-       printbuf_exit(&buf);
-       clear_btree_node_read_in_flight(b);
-       smp_mb__after_atomic();
-       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
-       struct btree_read_bio *rb =
-               container_of(bio, struct btree_read_bio, bio);
-       struct bch_fs *c        = rb->c;
-       struct bch_dev *ca      = rb->have_ioref
-               ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-                                  rb->start_time, !bio->bi_status);
-
-       queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
-{
-       bch2_bio_to_text(out, &rbio->bio);
-}
-
-struct btree_node_read_all {
-       struct closure          cl;
-       struct bch_fs           *c;
-       struct btree            *b;
-       unsigned                nr;
-       void                    *buf[BCH_REPLICAS_MAX];
-       struct bio              *bio[BCH_REPLICAS_MAX];
-       blk_status_t            err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
-       struct btree_node *bn = data;
-       struct btree_node_entry *bne;
-       unsigned offset = 0;
-
-       if (le64_to_cpu(bn->magic) != bset_magic(c))
-               return 0;
-
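-       /*
-        * Walk the chain of bsets: the node header is followed by
-        * btree_node_entries at block boundaries, each carrying the same
-        * keys.seq; stop at the first entry that doesn't match:
-        */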
-       while (offset < btree_sectors(c)) {
-               if (!offset) {
-                       offset += vstruct_sectors(bn, c->block_bits);
-               } else {
-                       bne = data + (offset << 9);
-                       if (bne->keys.seq != bn->keys.seq)
-                               break;
-                       offset += vstruct_sectors(bne, c->block_bits);
-               }
-       }
-
-       return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
-       struct btree_node *bn = data;
-       struct btree_node_entry *bne;
-
-       if (!offset)
-               return false;
-
-       while (offset < btree_sectors(c)) {
-               bne = data + (offset << 9);
-               if (bne->keys.seq == bn->keys.seq)
-                       return true;
-               offset++;
-       }
-
-       return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
-       closure_type(ra, struct btree_node_read_all, cl);
-       struct bch_fs *c = ra->c;
-       struct btree *b = ra->b;
-       struct printbuf buf = PRINTBUF;
-       bool dump_bset_maps = false;
-       int ret = 0, best = -1, write = READ;
-       unsigned i, written = 0, written2 = 0;
-       __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
-               ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
-       bool _saw_error = false, *saw_error = &_saw_error;
-       struct printbuf *err_msg = NULL;
-       struct bch_io_failures *failed = NULL;
-
-       for (i = 0; i < ra->nr; i++) {
-               struct btree_node *bn = ra->buf[i];
-
-               if (ra->err[i])
-                       continue;
-
-               if (le64_to_cpu(bn->magic) != bset_magic(c) ||
-                   (seq && seq != bn->keys.seq))
-                       continue;
-
-               if (best < 0) {
-                       best = i;
-                       written = btree_node_sectors_written(c, bn);
-                       continue;
-               }
-
-               written2 = btree_node_sectors_written(c, ra->buf[i]);
-               if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, NULL, NULL,
-                                btree_node_replicas_sectors_written_mismatch,
-                                "btree node sectors written mismatch: %u != %u",
-                                written, written2) ||
-                   btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-                                -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, NULL, NULL,
-                                btree_node_bset_after_end,
-                                "found bset signature after last bset") ||
-                   btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-                                -BCH_ERR_btree_node_read_err_fixable,
-                                c, NULL, b, NULL, NULL,
-                                btree_node_replicas_data_mismatch,
-                                "btree node replicas content mismatch"))
-                       dump_bset_maps = true;
-
-               if (written2 > written) {
-                       written = written2;
-                       best = i;
-               }
-       }
-fsck_err:
-       if (dump_bset_maps) {
-               for (i = 0; i < ra->nr; i++) {
-                       struct btree_node *bn = ra->buf[i];
-                       struct btree_node_entry *bne = NULL;
-                       unsigned offset = 0, sectors;
-                       bool gap = false;
-
-                       if (ra->err[i])
-                               continue;
-
-                       printbuf_reset(&buf);
-
-                       while (offset < btree_sectors(c)) {
-                               if (!offset) {
-                                       sectors = vstruct_sectors(bn, c->block_bits);
-                               } else {
-                                       bne = ra->buf[i] + (offset << 9);
-                                       if (bne->keys.seq != bn->keys.seq)
-                                               break;
-                                       sectors = vstruct_sectors(bne, c->block_bits);
-                               }
-
-                               prt_printf(&buf, " %u-%u", offset, offset + sectors);
-                               if (bne && bch2_journal_seq_is_blacklisted(c,
-                                                       le64_to_cpu(bne->keys.journal_seq), false))
-                                       prt_printf(&buf, "*");
-                               offset += sectors;
-                       }
-
-                       while (offset < btree_sectors(c)) {
-                               bne = ra->buf[i] + (offset << 9);
-                               if (bne->keys.seq == bn->keys.seq) {
-                                       if (!gap)
-                                               prt_printf(&buf, " GAP");
-                                       gap = true;
-
-                                       sectors = vstruct_sectors(bne, c->block_bits);
-                                       prt_printf(&buf, " %u-%u", offset, offset + sectors);
-                                       if (bch2_journal_seq_is_blacklisted(c,
-                                                       le64_to_cpu(bne->keys.journal_seq), false))
-                                               prt_printf(&buf, "*");
-                               }
-                               offset++;
-                       }
-
-                       bch_err(c, "replica %u:%s", i, buf.buf);
-               }
-       }
-
-       if (best >= 0) {
-               memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
-               ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
-       } else {
-               ret = -1;
-       }
-
-       if (ret) {
-               set_btree_node_read_error(b);
-
-               struct printbuf buf = PRINTBUF;
-               bch2_btree_lost_data(c, &buf, b->c.btree_id);
-               if (buf.pos)
-                       bch_err(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       } else if (*saw_error)
-               bch2_btree_node_rewrite_async(c, b);
-
-       for (i = 0; i < ra->nr; i++) {
-               mempool_free(ra->buf[i], &c->btree_bounce_pool);
-               bio_put(ra->bio[i]);
-       }
-
-       closure_debug_destroy(&ra->cl);
-       kfree(ra);
-       printbuf_exit(&buf);
-
-       clear_btree_node_read_in_flight(b);
-       smp_mb__after_atomic();
-       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_all_replicas_endio(struct bio *bio)
-{
-       struct btree_read_bio *rb =
-               container_of(bio, struct btree_read_bio, bio);
-       struct bch_fs *c        = rb->c;
-       struct btree_node_read_all *ra = rb->ra;
-
-       if (rb->have_ioref) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
-               bch2_latency_acct(ca, rb->start_time, READ);
-               enumerated_ref_put(&ca->io_ref[READ],
-                       BCH_DEV_READ_REF_btree_node_read_all_replicas);
-       }
-
-       ra->err[rb->idx] = bio->bi_status;
-       closure_put(&ra->cl);
-}
-
-/*
- * XXX This allocates multiple times from the same mempools, and can deadlock
- * under sufficient memory pressure (but is only a debug path)
- */
-static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
-{
-       struct bkey_s_c k = bkey_i_to_s_c(&b->key);
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded pick;
-       struct btree_node_read_all *ra;
-       unsigned i;
-
-       ra = kzalloc(sizeof(*ra), GFP_NOFS);
-       if (!ra)
-               return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
-
-       closure_init(&ra->cl, NULL);
-       ra->c   = c;
-       ra->b   = b;
-       ra->nr  = bch2_bkey_nr_ptrs(k);
-
-       for (i = 0; i < ra->nr; i++) {
-               ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
-               ra->bio[i] = bio_alloc_bioset(NULL,
-                                             buf_pages(ra->buf[i], btree_buf_bytes(b)),
-                                             REQ_OP_READ|REQ_SYNC|REQ_META,
-                                             GFP_NOFS,
-                                             &c->btree_bio);
-       }
-
-       i = 0;
-       bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
-               struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-                                       BCH_DEV_READ_REF_btree_node_read_all_replicas);
-               struct btree_read_bio *rb =
-                       container_of(ra->bio[i], struct btree_read_bio, bio);
-               rb->c                   = c;
-               rb->b                   = b;
-               rb->ra                  = ra;
-               rb->start_time          = local_clock();
-               rb->have_ioref          = ca != NULL;
-               rb->idx                 = i;
-               rb->pick                = pick;
-               rb->bio.bi_iter.bi_sector = pick.ptr.offset;
-               rb->bio.bi_end_io       = btree_node_read_all_replicas_endio;
-               bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
-
-               if (rb->have_ioref) {
-                       this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
-                                    bio_sectors(&rb->bio));
-                       bio_set_dev(&rb->bio, ca->disk_sb.bdev);
-
-                       closure_get(&ra->cl);
-                       submit_bio(&rb->bio);
-               } else {
-                       ra->err[i] = BLK_STS_REMOVED;
-               }
-
-               i++;
-       }
-
-       if (sync) {
-               closure_sync(&ra->cl);
-               btree_node_read_all_replicas_done(&ra->cl.work);
-       } else {
-               continue_at(&ra->cl, btree_node_read_all_replicas_done,
-                           c->btree_read_complete_wq);
-       }
-
-       return 0;
-}
-
-void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
-                         bool sync)
-{
-       struct bch_fs *c = trans->c;
-       struct extent_ptr_decoded pick;
-       struct btree_read_bio *rb;
-       struct bch_dev *ca;
-       struct bio *bio;
-       int ret;
-
-       trace_and_count(c, btree_node_read, trans, b);
-
-       if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
-           !btree_node_read_all_replicas(c, b, sync))
-               return;
-
-       ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
-                                        NULL, &pick, -1);
-
-       if (ret <= 0) {
-               bool ratelimit = true;
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               prt_str(&buf, "btree node read error: no device to read from\n at ");
-               bch2_btree_pos_to_text(&buf, c, b);
-               prt_newline(&buf);
-               bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
-               if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
-                   bch2_fs_emergency_read_only2(c, &buf))
-                       ratelimit = false;
-
-               static DEFINE_RATELIMIT_STATE(rs,
-                                             DEFAULT_RATELIMIT_INTERVAL,
-                                             DEFAULT_RATELIMIT_BURST);
-               if (!ratelimit || __ratelimit(&rs))
-                       bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-
-               set_btree_node_read_error(b);
-               clear_btree_node_read_in_flight(b);
-               smp_mb__after_atomic();
-               wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-               return;
-       }
-
-       ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-
-       bio = bio_alloc_bioset(NULL,
-                              buf_pages(b->data, btree_buf_bytes(b)),
-                              REQ_OP_READ|REQ_SYNC|REQ_META,
-                              GFP_NOFS,
-                              &c->btree_bio);
-       rb = container_of(bio, struct btree_read_bio, bio);
-       rb->c                   = c;
-       rb->b                   = b;
-       rb->ra                  = NULL;
-       rb->start_time          = local_clock();
-       rb->have_ioref          = ca != NULL;
-       rb->pick                = pick;
-       INIT_WORK(&rb->work, btree_node_read_work);
-       bio->bi_iter.bi_sector  = pick.ptr.offset;
-       bio->bi_end_io          = btree_node_read_endio;
-       bch2_bio_map(bio, b->data, btree_buf_bytes(b));
-
-       async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
-
-       if (rb->have_ioref) {
-               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
-                            bio_sectors(bio));
-               bio_set_dev(bio, ca->disk_sb.bdev);
-
-               if (sync) {
-                       submit_bio_wait(bio);
-                       bch2_latency_acct(ca, rb->start_time, READ);
-                       btree_node_read_work(&rb->work);
-               } else {
-                       submit_bio(bio);
-               }
-       } else {
-               bio->bi_status = BLK_STS_REMOVED;
-
-               if (sync)
-                       btree_node_read_work(&rb->work);
-               else
-                       queue_work(c->btree_read_complete_wq, &rb->work);
-       }
-}
-
-static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
-                                 const struct bkey_i *k, unsigned level)
-{
-       struct bch_fs *c = trans->c;
-       struct closure cl;
-       struct btree *b;
-       int ret;
-
-       closure_init_stack(&cl);
-
-       do {
-               ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-               closure_sync(&cl);
-       } while (ret);
-
-       b = bch2_btree_node_mem_alloc(trans, level != 0);
-       bch2_btree_cache_cannibalize_unlock(trans);
-
-       BUG_ON(IS_ERR(b));
-
-       bkey_copy(&b->key, k);
-       BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
-
-       set_btree_node_read_in_flight(b);
-
-       /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
-       bch2_trans_unlock(trans);
-       bch2_btree_node_read(trans, b, true);
-
-       if (btree_node_read_error(b)) {
-               mutex_lock(&c->btree_cache.lock);
-               bch2_btree_node_hash_remove(&c->btree_cache, b);
-               mutex_unlock(&c->btree_cache.lock);
-
-               ret = bch_err_throw(c, btree_node_read_error);
-               goto err;
-       }
-
-       bch2_btree_set_root_for_read(c, b);
-err:
-       six_unlock_write(&b->c.lock);
-       six_unlock_intent(&b->c.lock);
-
-       return ret;
-}
-
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
-                       const struct bkey_i *k, unsigned level)
-{
-       return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
-}
-
-struct btree_node_scrub {
-       struct bch_fs           *c;
-       struct bch_dev          *ca;
-       void                    *buf;
-       bool                    used_mempool;
-       unsigned                written;
-
-       enum btree_id           btree;
-       unsigned                level;
-       struct bkey_buf         key;
-       __le64                  seq;
-
-       struct work_struct      work;
-       struct bio              bio;
-};
-
-static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
-                                  struct printbuf *err)
-{
-       unsigned written = 0;
-
-       if (le64_to_cpu(data->magic) != bset_magic(c)) {
-               prt_printf(err, "bad magic: want %llx, got %llx",
-                          bset_magic(c), le64_to_cpu(data->magic));
-               return false;
-       }
-
-       while (written < (ptr_written ?: btree_sectors(c))) {
-               struct btree_node_entry *bne;
-               struct bset *i;
-               bool first = !written;
-
-               if (first) {
-                       bne = NULL;
-                       i = &data->keys;
-               } else {
-                       bne = (void *) data + (written << 9);
-                       i = &bne->keys;
-
-                       if (!ptr_written && i->seq != data->keys.seq)
-                               break;
-               }
-
-               struct nonce nonce = btree_nonce(i, written << 9);
-               bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
-               if (first) {
-                       if (good_csum_type) {
-                               struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
-                               if (bch2_crc_cmp(data->csum, csum)) {
-                                       bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
-                                       return false;
-                               }
-                       }
-
-                       written += vstruct_sectors(data, c->block_bits);
-               } else {
-                       if (good_csum_type) {
-                               struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-                               if (bch2_crc_cmp(bne->csum, csum)) {
-                                       bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
-                                       return false;
-                               }
-                       }
-
-                       written += vstruct_sectors(bne, c->block_bits);
-               }
-       }
-
-       return true;
-}
-
-static void btree_node_scrub_work(struct work_struct *work)
-{
-       struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
-       struct bch_fs *c = scrub->c;
-       struct printbuf err = PRINTBUF;
-
-       __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
-                                bkey_i_to_s_c(scrub->key.k));
-       prt_newline(&err);
-
-       if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
-               int ret = bch2_trans_do(c,
-                       bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
-                                                   scrub->key.k, 0));
-               if (!bch2_err_matches(ret, ENOENT) &&
-                   !bch2_err_matches(ret, EROFS))
-                       bch_err_fn_ratelimited(c, ret);
-       }
-
-       printbuf_exit(&err);
-       bch2_bkey_buf_exit(&scrub->key, c);
-       btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
-       enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-       kfree(scrub);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
-       struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
-       queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
-int bch2_btree_node_scrub(struct btree_trans *trans,
-                         enum btree_id btree, unsigned level,
-                         struct bkey_s_c k, unsigned dev)
-{
-       if (k.k->type != KEY_TYPE_btree_ptr_v2)
-               return 0;
-
-       struct bch_fs *c = trans->c;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
-               return bch_err_throw(c, erofs_no_writes);
-
-       struct extent_ptr_decoded pick;
-       int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
-       if (ret <= 0)
-               goto err;
-
-       struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-                                               BCH_DEV_READ_REF_btree_node_scrub);
-       if (!ca) {
-               ret = bch_err_throw(c, device_offline);
-               goto err;
-       }
-
-       bool used_mempool = false;
-       void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
-       unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
-       struct btree_node_scrub *scrub =
-               kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
-       if (!scrub) {
-               ret = -ENOMEM;
-               goto err_free;
-       }
-
-       scrub->c                = c;
-       scrub->ca               = ca;
-       scrub->buf              = buf;
-       scrub->used_mempool     = used_mempool;
-       scrub->written          = btree_ptr_sectors_written(k);
-
-       scrub->btree            = btree;
-       scrub->level            = level;
-       bch2_bkey_buf_init(&scrub->key);
-       bch2_bkey_buf_reassemble(&scrub->key, c, k);
-       scrub->seq              = bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
-       INIT_WORK(&scrub->work, btree_node_scrub_work);
-
-       bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
-       bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
-       scrub->bio.bi_iter.bi_sector    = pick.ptr.offset;
-       scrub->bio.bi_end_io            = btree_node_scrub_endio;
-       submit_bio(&scrub->bio);
-       return 0;
-err_free:
-       btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
-       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-err:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-       return ret;
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
-                                     struct btree_write *w)
-{
-       unsigned long old, new;
-
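-       /*
-        * will_make_reachable packs a btree_update pointer with bit 0 set
-        * while the write is outstanding; clear the flag and drop the
-        * update's closure ref if it was still set:
-        */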
-       old = READ_ONCE(b->will_make_reachable);
-       do {
-               new = old;
-               if (!(old & 1))
-                       break;
-
-               new &= ~1UL;
-       } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
-
-       if (old & 1)
-               closure_put(&((struct btree_update *) new)->cl);
-
-       bch2_journal_pin_drop(&c->journal, &w->journal);
-}
-
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
-       struct btree_write *w = btree_prev_write(b);
-       unsigned long old, new;
-       unsigned type = 0;
-
-       bch2_btree_complete_write(c, b, w);
-
-       if (start_time)
-               bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
-
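-       /*
-        * If the node was redirtied and needs another write (and isn't
-        * blocked), start it immediately, keeping write_in_flight set;
-        * otherwise clear the in-flight bits:
-        */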
-       old = READ_ONCE(b->flags);
-       do {
-               new = old;
-
-               if ((old & (1U << BTREE_NODE_dirty)) &&
-                   (old & (1U << BTREE_NODE_need_write)) &&
-                   !(old & (1U << BTREE_NODE_never_write)) &&
-                   !(old & (1U << BTREE_NODE_write_blocked)) &&
-                   !(old & (1U << BTREE_NODE_will_make_reachable))) {
-                       new &= ~(1U << BTREE_NODE_dirty);
-                       new &= ~(1U << BTREE_NODE_need_write);
-                       new |=  (1U << BTREE_NODE_write_in_flight);
-                       new |=  (1U << BTREE_NODE_write_in_flight_inner);
-                       new |=  (1U << BTREE_NODE_just_written);
-                       new ^=  (1U << BTREE_NODE_write_idx);
-
-                       type = new & BTREE_WRITE_TYPE_MASK;
-                       new &= ~BTREE_WRITE_TYPE_MASK;
-               } else {
-                       new &= ~(1U << BTREE_NODE_write_in_flight);
-                       new &= ~(1U << BTREE_NODE_write_in_flight_inner);
-               }
-       } while (!try_cmpxchg(&b->flags, &old, new));
-
-       if (new & (1U << BTREE_NODE_write_in_flight))
-               __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
-       else {
-               smp_mb__after_atomic();
-               wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-       }
-}
-
-static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
-       /* we don't need the transaction context anymore once we hold the lock */
-       bch2_trans_put(trans);
-       __btree_node_write_done(c, b, start_time);
-       six_unlock_read(&b->c.lock);
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
-       struct btree_write_bio *wbio =
-               container_of(work, struct btree_write_bio, work);
-       struct bch_fs *c        = wbio->wbio.c;
-       struct btree *b         = wbio->wbio.bio.bi_private;
-       u64 start_time          = wbio->start_time;
-       int ret = 0;
-
-       btree_bounce_free(c,
-               wbio->data_bytes,
-               wbio->wbio.used_mempool,
-               wbio->data);
-
-       bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
-               bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
-       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
-               ret = bch_err_throw(c, btree_node_write_all_failed);
-               goto err;
-       }
-
-       if (wbio->wbio.first_btree_write) {
-               if (wbio->wbio.failed.nr) {
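-                       /* XXX: failed replicas on a node's first write aren't handled here yet */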
-
-               }
-       } else {
-               ret = bch2_trans_do(c,
-                       bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-                                       BCH_WATERMARK_interior_updates|
-                                       BCH_TRANS_COMMIT_journal_reclaim|
-                                       BCH_TRANS_COMMIT_no_enospc|
-                                       BCH_TRANS_COMMIT_no_check_rw,
-                                       !wbio->wbio.failed.nr));
-               if (ret)
-                       goto err;
-       }
-out:
-       async_object_list_del(c, btree_write_bio, wbio->list_idx);
-       bio_put(&wbio->wbio.bio);
-       btree_node_write_done(c, b, start_time);
-       return;
-err:
-       set_btree_node_noevict(b);
-
-       if (!bch2_err_matches(ret, EROFS)) {
-               struct printbuf buf = PRINTBUF;
-               prt_printf(&buf, "writing btree node: %s\n  ", bch2_err_str(ret));
-               bch2_btree_pos_to_text(&buf, c, b);
-               bch2_fs_fatal_error(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-       goto out;
-}
-
-static void btree_node_write_endio(struct bio *bio)
-{
-       struct bch_write_bio *wbio      = to_wbio(bio);
-       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
-       struct bch_write_bio *orig      = parent ?: wbio;
-       struct btree_write_bio *wb      = container_of(orig, struct btree_write_bio, wbio);
-       struct bch_fs *c                = wbio->c;
-       struct btree *b                 = wbio->bio.bi_private;
-       struct bch_dev *ca              = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
-                                  wbio->submit_time, !bio->bi_status);
-
-       if (ca && bio->bi_status) {
-               struct printbuf buf = PRINTBUF;
-               buf.atomic++;
-               prt_printf(&buf, "btree write error: %s\n  ",
-                          bch2_blk_status_to_str(bio->bi_status));
-               bch2_btree_pos_to_text(&buf, c, b);
-               bch_err_dev_ratelimited(ca, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       if (bio->bi_status) {
-               unsigned long flags;
-               spin_lock_irqsave(&c->btree_write_error_lock, flags);
-               bch2_dev_list_add_dev(&orig->failed, wbio->dev);
-               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
-       }
-
-       /*
-        * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
-        * btree writes yet (due to device removal/ro):
-        */
-       if (wbio->have_ioref)
-               enumerated_ref_put(&ca->io_ref[READ],
-                                  BCH_DEV_READ_REF_btree_node_write);
-
-       if (parent) {
-               bio_put(bio);
-               bio_endio(&parent->bio);
-               return;
-       }
-
-       clear_btree_node_write_in_flight_inner(b);
-       smp_mb__after_atomic();
-       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
-       INIT_WORK(&wb->work, btree_node_write_work);
-       queue_work(c->btree_write_complete_wq, &wb->work);
-}
-
-static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
-                                  struct bset *i)
-{
-       int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
-                                    (struct bkey_validate_context) {
-                                       .from   = BKEY_VALIDATE_btree_node,
-                                       .level  = b->c.level + 1,
-                                       .btree  = b->c.btree_id,
-                                       .flags  = BCH_VALIDATE_write,
-                                    });
-       if (ret) {
-               bch2_fs_inconsistent(c, "invalid btree node key before write");
-               return ret;
-       }
-
-       ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
-               validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
-       if (ret) {
-               bch2_inconsistent_error(c);
-               dump_stack();
-       }
-
-       return ret;
-}
-
-static void btree_write_submit(struct work_struct *work)
-{
-       struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
-       BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-
-       bkey_copy(&tmp.k, &wbio->key);
-
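-       /*
-        * The key's pointers point at the start of the node; offset them by
-        * this write's position within the node so each replica's bio lands
-        * on the right blocks:
-        */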
-       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
-               ptr->offset += wbio->sector_offset;
-
-       bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
-                                 &tmp.k, false);
-}
-
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
-{
-       struct btree_write_bio *wbio;
-       struct bset *i;
-       struct btree_node *bn = NULL;
-       struct btree_node_entry *bne = NULL;
-       struct sort_iter_stack sort_iter;
-       struct nonce nonce;
-       unsigned bytes_to_write, sectors_to_write, bytes, u64s;
-       u64 seq = 0;
-       bool used_mempool;
-       unsigned long old, new;
-       bool validate_before_checksum = false;
-       enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
-       void *data;
-       u64 start_time = local_clock();
-       int ret;
-
-       if (flags & BTREE_WRITE_ALREADY_STARTED)
-               goto do_write;
-
-       /*
-        * We may only have a read lock on the btree node - the dirty bit is our
-        * "lock" against racing with other threads that may be trying to start
-        * a write, we do a write iff we clear the dirty bit. Since setting the
-        * dirty bit requires a write lock, we can't race with other threads
-        * redirtying it:
-        */
-       old = READ_ONCE(b->flags);
-       do {
-               new = old;
-
-               if (!(old & (1 << BTREE_NODE_dirty)))
-                       return;
-
-               if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
-                   !(old & (1 << BTREE_NODE_need_write)))
-                       return;
-
-               if (old &
-                   ((1 << BTREE_NODE_never_write)|
-                    (1 << BTREE_NODE_write_blocked)))
-                       return;
-
-               if (b->written &&
-                   (old & (1 << BTREE_NODE_will_make_reachable)))
-                       return;
-
-               if (old & (1 << BTREE_NODE_write_in_flight))
-                       return;
-
-               if (flags & BTREE_WRITE_ONLY_IF_NEED)
-                       type = new & BTREE_WRITE_TYPE_MASK;
-               new &= ~BTREE_WRITE_TYPE_MASK;
-
-               new &= ~(1 << BTREE_NODE_dirty);
-               new &= ~(1 << BTREE_NODE_need_write);
-               new |=  (1 << BTREE_NODE_write_in_flight);
-               new |=  (1 << BTREE_NODE_write_in_flight_inner);
-               new |=  (1 << BTREE_NODE_just_written);
-               new ^=  (1 << BTREE_NODE_write_idx);
-       } while (!try_cmpxchg_acquire(&b->flags, &old, new));
-
-       if (new & (1U << BTREE_NODE_need_write))
-               return;
-do_write:
-       BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
-
-       atomic_long_dec(&c->btree_cache.nr_dirty);
-
-       BUG_ON(btree_node_fake(b));
-       BUG_ON((b->will_make_reachable != 0) != !b->written);
-
-       BUG_ON(b->written >= btree_sectors(c));
-       BUG_ON(b->written & (block_sectors(c) - 1));
-       BUG_ON(bset_written(b, btree_bset_last(b)));
-       BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
-       BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
-       bch2_sort_whiteouts(c, b);
-
-       sort_iter_stack_init(&sort_iter, b);
-
-       bytes = !b->written
-               ? sizeof(struct btree_node)
-               : sizeof(struct btree_node_entry);
-
-       bytes += b->whiteout_u64s * sizeof(u64);
-
-       for_each_bset(b, t) {
-               i = bset(b, t);
-
-               if (bset_written(b, i))
-                       continue;
-
-               bytes += le16_to_cpu(i->u64s) * sizeof(u64);
-               sort_iter_add(&sort_iter.iter,
-                             btree_bkey_first(b, t),
-                             btree_bkey_last(b, t));
-               seq = max(seq, le64_to_cpu(i->journal_seq));
-       }
-
-       BUG_ON(b->written && !seq);
-
-       /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
-       bytes += 8;
-
-       /* buffer must be a multiple of the block size */
-       bytes = round_up(bytes, block_bytes(c));
-
-       data = btree_bounce_alloc(c, bytes, &used_mempool);
-
-       if (!b->written) {
-               bn = data;
-               *bn = *b->data;
-               i = &bn->keys;
-       } else {
-               bne = data;
-               bne->keys = b->data->keys;
-               i = &bne->keys;
-       }
-
-       i->journal_seq  = cpu_to_le64(seq);
-       i->u64s         = 0;
-
-       sort_iter_add(&sort_iter.iter,
-                     unwritten_whiteouts_start(b),
-                     unwritten_whiteouts_end(b));
-       SET_BSET_SEPARATE_WHITEOUTS(i, false);
-
-       u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
-       le16_add_cpu(&i->u64s, u64s);
-
-       b->whiteout_u64s = 0;
-
-       BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
-
-       set_needs_whiteout(i, false);
-
-       /* do we have data to write? */
-       if (b->written && !i->u64s)
-               goto nowrite;
-
-       bytes_to_write = vstruct_end(i) - data;
-       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
-       if (!b->written &&
-           b->key.k.type == KEY_TYPE_btree_ptr_v2)
-               BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
-
-       memset(data + bytes_to_write, 0,
-              (sectors_to_write << 9) - bytes_to_write);
-
-       BUG_ON(b->written + sectors_to_write > btree_sectors(c));
-       BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
-       BUG_ON(i->seq != b->data->keys.seq);
-
-       i->version = cpu_to_le16(c->sb.version);
-       SET_BSET_OFFSET(i, b->written);
-       SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
-
-       if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
-               validate_before_checksum = true;
-
-       /* validate_bset may modify the bset in place - run it before checksumming: */
-       if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
-               validate_before_checksum = true;
-
-       /* if we're going to be encrypting, check metadata validity first: */
-       if (validate_before_checksum &&
-           validate_bset_for_write(c, b, i))
-               goto err;
-
-       ret = bset_encrypt(c, i, b->written << 9);
-       if (bch2_fs_fatal_err_on(ret, c,
-                       "encrypting btree node: %s", bch2_err_str(ret)))
-               goto err;
-
-       nonce = btree_nonce(i, b->written << 9);
-
-       if (bn)
-               bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
-       else
-               bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
-       /* if we're not encrypting, check metadata after checksumming: */
-       if (!validate_before_checksum &&
-           validate_bset_for_write(c, b, i))
-               goto err;
-
-       /*
-        * We handle btree write errors by immediately halting the journal -
-        * after we've done that, we can't issue any subsequent btree writes
-        * because they might have pointers to new nodes that failed to write.
-        *
-        * Furthermore, there's no point in doing any more btree writes because
-        * with the journal stopped, we're never going to update the journal to
-        * reflect that those writes were done and the data flushed from the
-        * journal:
-        *
-        * Also on journal error, the pending write may have updates that were
-        * never journalled (interior nodes, see btree_update_nodes_written()) -
-        * it's critical that we don't do the write in that case otherwise we
-        * will have updates visible that weren't in the journal:
-        *
-        * Make sure to update b->written so bch2_btree_init_next() doesn't
-        * break:
-        */
-       if (bch2_journal_error(&c->journal) ||
-           c->opts.nochanges)
-               goto err;
-
-       trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
-
-       wbio = container_of(bio_alloc_bioset(NULL,
-                               buf_pages(data, sectors_to_write << 9),
-                               REQ_OP_WRITE|REQ_META,
-                               GFP_NOFS,
-                               &c->btree_bio),
-                           struct btree_write_bio, wbio.bio);
-       wbio_init(&wbio->wbio.bio);
-       wbio->data                      = data;
-       wbio->data_bytes                = bytes;
-       wbio->sector_offset             = b->written;
-       wbio->start_time                = start_time;
-       wbio->wbio.c                    = c;
-       wbio->wbio.used_mempool         = used_mempool;
-       wbio->wbio.first_btree_write    = !b->written;
-       wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
-       wbio->wbio.bio.bi_private       = b;
-
-       bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
-
-       bkey_copy(&wbio->key, &b->key);
-
-       b->written += sectors_to_write;
-
-       if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
-               bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
-                       cpu_to_le16(b->written);
-
-       atomic64_inc(&c->btree_write_stats[type].nr);
-       atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
-
-       async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
-
-       INIT_WORK(&wbio->work, btree_write_submit);
-       queue_work(c->btree_write_submit_wq, &wbio->work);
-       return;
-err:
-       set_btree_node_noevict(b);
-       b->written += sectors_to_write;
-nowrite:
-       btree_bounce_free(c, bytes, used_mempool, data);
-       __btree_node_write_done(c, b, 0);
-}
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
-       bool invalidated_iter = false;
-       struct btree_node_entry *bne;
-
-       if (!btree_node_just_written(b))
-               return false;
-
-       BUG_ON(b->whiteout_u64s);
-
-       clear_btree_node_just_written(b);
-
-       /*
-        * Note: immediately after write, bset_written() doesn't work - the
-        * amount of data we had to write after compaction might have been
-        * smaller than the offset of the last bset.
-        *
-        * However, we know that all bsets have been written here, as long as
-        * we're still holding the write lock:
-        */
-
-       /*
-        * XXX: decide if we really want to unconditionally sort down to a
-        * single bset:
-        */
-       if (b->nsets > 1) {
-               btree_node_sort(c, b, 0, b->nsets);
-               invalidated_iter = true;
-       } else {
-               invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
-       }
-
-       for_each_bset(b, t)
-               set_needs_whiteout(bset(b, t), true);
-
-       bch2_btree_verify(c, b);
-
-       /*
-        * If later we don't unconditionally sort down to a single bset, we have
-        * to ensure this is still true:
-        */
-       BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
-       bne = want_new_bset(c, b);
-       if (bne)
-               bch2_bset_init_next(b, bne);
-
-       bch2_btree_build_aux_trees(b);
-
-       return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-                          enum six_lock_type lock_type_held,
-                          unsigned flags)
-{
-       if (lock_type_held == SIX_LOCK_intent ||
-           (lock_type_held == SIX_LOCK_read &&
-            six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b, flags);
-
-               /* don't cycle lock unnecessarily: */
-               if (btree_node_just_written(b) &&
-                   six_trylock_write(&b->c.lock)) {
-                       bch2_btree_post_write_cleanup(c, b);
-                       six_unlock_write(&b->c.lock);
-               }
-
-               if (lock_type_held == SIX_LOCK_read)
-                       six_lock_downgrade(&b->c.lock);
-       } else {
-               __bch2_btree_node_write(c, b, flags);
-               if (lock_type_held == SIX_LOCK_write &&
-                   btree_node_just_written(b))
-                       bch2_btree_post_write_cleanup(c, b);
-       }
-}
-
-void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
-                                enum six_lock_type lock_type_held,
-                                unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-
-       if (lock_type_held == SIX_LOCK_intent ||
-           (lock_type_held == SIX_LOCK_read &&
-            six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b, flags);
-
-               /* don't cycle lock unnecessarily: */
-               if (btree_node_just_written(b) &&
-                   six_trylock_write(&b->c.lock)) {
-                       bch2_btree_post_write_cleanup(c, b);
-                       __bch2_btree_node_unlock_write(trans, b);
-               }
-
-               if (lock_type_held == SIX_LOCK_read)
-                       six_lock_downgrade(&b->c.lock);
-       } else {
-               __bch2_btree_node_write(c, b, flags);
-               if (lock_type_held == SIX_LOCK_write &&
-                   btree_node_just_written(b))
-                       bch2_btree_post_write_cleanup(c, b);
-       }
-}
-
-static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
-{
-       struct bucket_table *tbl;
-       struct rhash_head *pos;
-       struct btree *b;
-       unsigned i;
-       bool ret = false;
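-       /*
-        * We can't sleep on the flag bit while inside the RCU read-side
-        * section: drop it, wait, then rescan the hash table from scratch:
-        */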
-restart:
-       rcu_read_lock();
-       for_each_cached_btree(b, c, tbl, i, pos)
-               if (test_bit(flag, &b->flags)) {
-                       rcu_read_unlock();
-                       wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
-                       ret = true;
-                       goto restart;
-               }
-       rcu_read_unlock();
-
-       return ret;
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *c)
-{
-       return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
-}
-
-bool bch2_btree_flush_all_writes(struct bch_fs *c)
-{
-       return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
-}
-
-static const char * const bch2_btree_write_types[] = {
-#define x(t, n) [n] = #t,
-       BCH_BTREE_WRITE_TYPES()
-       NULL
-};
-
-void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       printbuf_tabstop_push(out, 20);
-       printbuf_tabstop_push(out, 10);
-
-       prt_printf(out, "\tnr\tsize\n");
-
-       for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
-               u64 nr          = atomic64_read(&c->btree_write_stats[i].nr);
-               u64 bytes       = atomic64_read(&c->btree_write_stats[i].bytes);
-
-               prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
-               prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
-               prt_newline(out);
-       }
-}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
deleted file mode 100644 (file)
index 30a5180..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_IO_H
-#define _BCACHEFS_BTREE_IO_H
-
-#include "bkey_methods.h"
-#include "bset.h"
-#include "btree_locking.h"
-#include "checksum.h"
-#include "extents.h"
-#include "io_write_types.h"
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-struct btree_node_read_all;
-
-static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
-       if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
-               atomic_long_inc(&c->btree_cache.nr_dirty);
-}
-
-static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
-       if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
-               atomic_long_dec(&c->btree_cache.nr_dirty);
-}
-
-static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
-{
-       return k.k->type == KEY_TYPE_btree_ptr_v2
-               ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
-               : 0;
-}
-
-struct btree_read_bio {
-       struct bch_fs           *c;
-       struct btree            *b;
-       struct btree_node_read_all *ra;
-       u64                     start_time;
-       unsigned                have_ioref:1;
-       unsigned                idx:7;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       unsigned                list_idx;
-#endif
-       struct extent_ptr_decoded       pick;
-       struct work_struct      work;
-       struct bio              bio;
-};
-
-struct btree_write_bio {
-       struct work_struct      work;
-       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-       void                    *data;
-       unsigned                data_bytes;
-       unsigned                sector_offset;
-       u64                     start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       unsigned                list_idx;
-#endif
-       struct bch_write_bio    wbio;
-};
-
-void bch2_btree_node_io_unlock(struct btree *);
-void bch2_btree_node_io_lock(struct btree *);
-void __bch2_btree_node_wait_on_read(struct btree *);
-void __bch2_btree_node_wait_on_write(struct btree *);
-void bch2_btree_node_wait_on_read(struct btree *);
-void bch2_btree_node_wait_on_write(struct btree *);
-
-enum compact_mode {
-       COMPACT_LAZY,
-       COMPACT_ALL,
-};
-
-bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
-                           enum compact_mode);
-
-static inline bool should_compact_bset_lazy(struct btree *b,
-                                           struct bset_tree *t)
-{
-       unsigned total_u64s = bset_u64s(t);
-       unsigned dead_u64s = bset_dead_u64s(b, t);
-
-       return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
-}
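-
-/*
- * Editor's worked example (not part of the original source): with
- * total_u64s = 150 and dead_u64s = 70 the heuristic fires (70 > 64 and
- * 70 * 3 = 210 > 150); with dead_u64s = 40 it does not (40 <= 64), since
- * compacting a bset with few dead keys isn't worth the cost.
- */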
-
-static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
-       for_each_bset(b, t)
-               if (should_compact_bset_lazy(b, t))
-                       return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
-
-       return false;
-}
-
-static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
-       return (struct nonce) {{
-               [0] = cpu_to_le32(offset),
-               [1] = ((__le32 *) &i->seq)[0],
-               [2] = ((__le32 *) &i->seq)[1],
-               [3] = ((__le32 *) &i->journal_seq)[0] ^ BCH_NONCE_BTREE,
-       }};
-}
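-
-/*
- * Editor's note: the nonce above is four little-endian 32-bit words - the
- * bset's offset within the node, the two halves of the bset's 64-bit seq,
- * and the low half of journal_seq xored with the BCH_NONCE_BTREE domain
- * tag - so distinct bsets, and btree vs. non-btree users, get distinct
- * nonces.
- */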
-
-static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
-       struct nonce nonce = btree_nonce(i, offset);
-       int ret;
-
-       if (!offset) {
-               struct btree_node *bn = container_of(i, struct btree_node, keys);
-               unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
-               ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
-                                  &bn->flags, bytes);
-               if (ret)
-                       return ret;
-
-               nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
-       }
-
-       return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
-                           vstruct_end(i) - (void *) i->_data);
-}
-
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch2_btree_node_drop_keys_outside_node(struct btree *);
-
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree *);
-
-int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
-                             struct btree *,
-                             struct bch_io_failures *,
-                             struct printbuf *);
-void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
-                        const struct bkey_i *, unsigned);
-
-void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
-
-int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
-                         struct bkey_s_c, unsigned);
-
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-enum btree_write_flags {
-       __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
-       __BTREE_WRITE_ALREADY_STARTED,
-};
-#define BTREE_WRITE_ONLY_IF_NEED       BIT(__BTREE_WRITE_ONLY_IF_NEED)
-#define BTREE_WRITE_ALREADY_STARTED    BIT(__BTREE_WRITE_ALREADY_STARTED)
-
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
-                          enum six_lock_type, unsigned);
-void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
-                                enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
-                                           enum six_lock_type lock_held)
-{
-       bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
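-
-/*
- * Editor's sketch (restating the wrapper above, not new API): a caller
- * holding an intent lock that wants a node written out only if it's dirty
- * would do
- *
- *     btree_node_write_if_need(trans, b, SIX_LOCK_intent);
- *
- * which forwards to bch2_btree_node_write_trans() with
- * BTREE_WRITE_ONLY_IF_NEED set, making the write a no-op on clean nodes.
- */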
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-                                 unsigned version, unsigned big_endian,
-                                 int write, struct bkey_format *f)
-{
-       if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id == BTREE_ID_inodes) {
-               swap(f->bits_per_field[BKEY_FIELD_INODE],
-                    f->bits_per_field[BKEY_FIELD_OFFSET]);
-               swap(f->field_offset[BKEY_FIELD_INODE],
-                    f->field_offset[BKEY_FIELD_OFFSET]);
-       }
-
-       if (version < bcachefs_metadata_version_snapshot &&
-           (level || btree_type_has_snapshots(btree_id))) {
-               u64 max_packed =
-                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
-               f->field_offset[BKEY_FIELD_SNAPSHOT] = write
-                       ? 0
-                       : cpu_to_le64(U32_MAX - max_packed);
-       }
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
-                              unsigned version, unsigned big_endian,
-                              int write, struct bpos *p)
-{
-       if (big_endian != CPU_BIG_ENDIAN)
-               bch2_bpos_swab(p);
-
-       if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id == BTREE_ID_inodes)
-               swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
-                                    unsigned version, unsigned big_endian,
-                                    int write,
-                                    struct btree_node *bn)
-{
-       if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id_is_extents(btree_id) &&
-           !bpos_eq(bn->min_key, POS_MIN) &&
-           write)
-               bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
-       if (version < bcachefs_metadata_version_snapshot &&
-           write)
-               bn->max_key.snapshot = 0;
-
-       compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
-       compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
-       if (version < bcachefs_metadata_version_snapshot &&
-           !write)
-               bn->max_key.snapshot = U32_MAX;
-
-       if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id_is_extents(btree_id) &&
-           !bpos_eq(bn->min_key, POS_MIN) &&
-           !write)
-               bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
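-
-/*
- * Editor's note: the write (write != 0) and read (write == 0) directions of
- * compat_btree_node() are inverses - min_key gets its predecessor on write
- * and its successor back on read, and max_key.snapshot is cleared to 0 on
- * write and restored to U32_MAX on read, with the byte-order fixup from
- * compat_bpos() applied in between.
- */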
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644 (file)
index f8829b6..0000000
+++ /dev/null
@@ -1,3804 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
-                       btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
-       return iter->ip_allocated;
-#else
-       return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
-                                  enum btree_id        r_btree_id,
-                                  bool                 r_cached,
-                                  struct bpos          r_pos,
-                                  unsigned             r_level)
-{
-       /*
-        * Must match lock ordering as defined by __bch2_btree_node_lock:
-        */
-       return   cmp_int(l->btree_id,   r_btree_id) ?:
-                cmp_int((int) l->cached,       (int) r_cached) ?:
-                bpos_cmp(l->pos,       r_pos) ?:
-               -cmp_int(l->level,      r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
-                                const struct btree_path *r)
-{
-       return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
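-
-/*
- * Editor's note: the negated level comparison above means that, at equal
- * btree_id, cached and pos, paths at higher (interior) levels sort first -
- * presumably so that sorted-order lock acquisition proceeds top down.
- */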
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
-       /* Are we iterating over keys in all snapshots? */
-       if (iter->flags & BTREE_ITER_all_snapshots) {
-               p = bpos_successor(p);
-       } else {
-               p = bpos_nosnap_successor(p);
-               p.snapshot = iter->snapshot;
-       }
-
-       return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
-       /* Are we iterating over keys in all snapshots? */
-       if (iter->flags & BTREE_ITER_all_snapshots) {
-               p = bpos_predecessor(p);
-       } else {
-               p = bpos_nosnap_predecessor(p);
-               p.snapshot = iter->snapshot;
-       }
-
-       return p;
-}
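-
-/*
- * Editor's example (positions made up): when iterating all snapshots the
- * successor of SPOS(1, 10, 3) is SPOS(1, 10, 4), snapshot being the least
- * significant field; a snapshot-filtered iterator instead skips ahead to
- * SPOS(1, 11, iter->snapshot).
- */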
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
-       struct bpos pos = iter->pos;
-
-       if ((iter->flags & BTREE_ITER_is_extents) &&
-           !bkey_eq(pos, POS_MAX))
-               pos = bkey_successor(iter, pos);
-       return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
-                                             struct btree *b)
-{
-       return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
-                                            struct btree *b)
-{
-       return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
-                                         struct btree *b)
-{
-       return path->btree_id == b->c.btree_id &&
-               !btree_path_pos_before_node(path, b) &&
-               !btree_path_pos_after_node(path, b);
-}
-
-/* Debug: */
-
-static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
-                                         struct btree_path *path)
-{
-       struct bkey_cached *ck;
-       bool locked = btree_node_locked(path, 0);
-
-       if (!bch2_btree_node_relock(trans, path, 0))
-               return;
-
-       ck = (void *) path->l[0].b;
-       BUG_ON(ck->key.btree_id != path->btree_id ||
-              !bkey_eq(ck->key.pos, path->pos));
-
-       if (!locked)
-               btree_node_unlock(trans, path, 0);
-}
-
-static void __bch2_btree_path_verify_level(struct btree_trans *trans,
-                               struct btree_path *path, unsigned level)
-{
-       struct btree_path_level *l;
-       struct btree_node_iter tmp;
-       bool locked;
-       struct bkey_packed *p, *k;
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-       struct printbuf buf3 = PRINTBUF;
-       const char *msg;
-
-       l       = &path->l[level];
-       tmp     = l->iter;
-       locked  = btree_node_locked(path, level);
-
-       if (path->cached) {
-               if (!level)
-                       __bch2_btree_path_verify_cached(trans, path);
-               return;
-       }
-
-       if (!btree_path_node(path, level))
-               return;
-
-       if (!bch2_btree_node_relock_notrace(trans, path, level))
-               return;
-
-       BUG_ON(!btree_path_pos_in_node(path, l->b));
-
-       bch2_btree_node_iter_verify(&l->iter, l->b);
-
-       /*
-        * For interior nodes, the iterator will have skipped past deleted keys:
-        */
-       p = level
-               ? bch2_btree_node_iter_prev(&tmp, l->b)
-               : bch2_btree_node_iter_prev_all(&tmp, l->b);
-       k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
-       if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
-               msg = "before";
-               goto err;
-       }
-
-       if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-               msg = "after";
-               goto err;
-       }
-
-       if (!locked)
-               btree_node_unlock(trans, path, level);
-       return;
-err:
-       bch2_bpos_to_text(&buf1, path->pos);
-
-       if (p) {
-               struct bkey uk = bkey_unpack_key(l->b, p);
-
-               bch2_bkey_to_text(&buf2, &uk);
-       } else {
-               prt_printf(&buf2, "(none)");
-       }
-
-       if (k) {
-               struct bkey uk = bkey_unpack_key(l->b, k);
-
-               bch2_bkey_to_text(&buf3, &uk);
-       } else {
-               prt_printf(&buf3, "(none)");
-       }
-
-       panic("path should be %s key at level %u:\n"
-             "path pos %s\n"
-             "prev key %s\n"
-             "cur  key %s\n",
-             msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void __bch2_btree_path_verify(struct btree_trans *trans,
-                                  struct btree_path *path)
-{
-       struct bch_fs *c = trans->c;
-
-       for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
-               if (!path->l[i].b) {
-                       BUG_ON(!path->cached &&
-                              bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
-                       break;
-               }
-
-               __bch2_btree_path_verify_level(trans, path, i);
-       }
-
-       bch2_btree_path_verify_locks(trans, path);
-}
-
-void __bch2_trans_verify_paths(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned iter;
-
-       trans_for_each_path(trans, path, iter)
-               __bch2_btree_path_verify(trans, path);
-}
-
-static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
-{
-       BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
-       BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
-              (iter->flags & BTREE_ITER_all_snapshots));
-
-       BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
-              (iter->flags & BTREE_ITER_all_snapshots) &&
-              !btree_type_has_snapshot_field(iter->btree_id));
-
-       if (iter->update_path)
-               __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
-       __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-       BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
-              !iter->pos.snapshot);
-
-       BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
-              iter->pos.snapshot != iter->snapshot);
-
-       BUG_ON(iter->flags & BTREE_ITER_all_snapshots   ? !bpos_eq(iter->pos, iter->k.p) :
-              !(iter->flags & BTREE_ITER_is_extents)   ? !bkey_eq(iter->pos, iter->k.p) :
-              (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
-               bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
-                                       struct btree_iter *iter, struct bkey_s_c k)
-{
-       struct btree_iter copy;
-       struct bkey_s_c prev;
-       int ret = 0;
-
-       if (!(iter->flags & BTREE_ITER_filter_snapshots))
-               return 0;
-
-       if (bkey_err(k) || !k.k)
-               return 0;
-
-       BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
-                                         iter->snapshot,
-                                         k.k->p.snapshot));
-
-       bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
-                            BTREE_ITER_nopreserve|
-                            BTREE_ITER_all_snapshots);
-       prev = bch2_btree_iter_prev(trans, &copy);
-       if (!prev.k)
-               goto out;
-
-       ret = bkey_err(prev);
-       if (ret)
-               goto out;
-
-       if (bkey_eq(prev.k->p, k.k->p) &&
-           bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
-                                     prev.k->p.snapshot) > 0) {
-               struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-               bch2_bkey_to_text(&buf1, k.k);
-               bch2_bkey_to_text(&buf2, prev.k);
-
-               panic("iter snap %u\n"
-                     "k    %s\n"
-                     "prev %s\n",
-                     iter->snapshot,
-                     buf1.buf, buf2.buf);
-       }
-out:
-       bch2_trans_iter_exit(trans, &copy);
-       return ret;
-}
-
-void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
-                           struct bpos pos)
-{
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       struct btree_path *path;
-       struct trans_for_each_path_inorder_iter iter;
-       struct printbuf buf = PRINTBUF;
-
-       btree_trans_sort_paths(trans);
-
-       trans_for_each_path_inorder(trans, path, iter) {
-               if (path->btree_id != id ||
-                   !btree_node_locked(path, 0) ||
-                   !path->should_be_locked)
-                       continue;
-
-               if (!path->cached) {
-                       if (bkey_ge(pos, path->l[0].b->data->min_key) &&
-                           bkey_le(pos, path->l[0].b->key.k.p))
-                               return;
-               } else {
-                       if (bkey_eq(pos, path->pos))
-                               return;
-               }
-       }
-
-       bch2_dump_trans_paths_updates(trans);
-       bch2_bpos_to_text(&buf, pos);
-
-       panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
-                                               struct btree_path *path, unsigned l)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_btree_path_verify_level(trans, path, l);
-}
-
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
-                                         struct btree_path *path)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_btree_path_verify(trans, path);
-}
-
-static inline void bch2_btree_iter_verify(struct btree_trans *trans,
-                                         struct btree_iter *iter)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_btree_iter_verify(trans, iter);
-}
-
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_btree_iter_verify_entry_exit(iter);
-}
-
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
-                                            struct bkey_s_c k)
-{
-       return static_branch_unlikely(&bch2_debug_check_iterators)
-               ? __bch2_btree_iter_verify_ret(trans, iter, k)
-               : 0;
-}
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
-                                       struct btree *b,
-                                       struct bset_tree *t,
-                                       struct bkey_packed *k)
-{
-       struct btree_node_iter_set *set;
-
-       btree_node_iter_for_each(iter, set)
-               if (set->end == t->end_offset) {
-                       set->k = __btree_node_key_to_offset(b, k);
-                       bch2_btree_node_iter_sort(iter, b);
-                       return;
-               }
-
-       bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
-                                              struct btree *b,
-                                              struct bkey_packed *where)
-{
-       struct btree_path_level *l = &path->l[b->c.level];
-
-       if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
-               return;
-
-       if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
-               bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-                                     struct btree *b,
-                                     struct bkey_packed *where)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path_with_node(trans, b, path, i) {
-               __bch2_btree_path_fix_key_modified(path, b, where);
-               bch2_btree_path_verify_level(trans, path, b->c.level);
-       }
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
-                                      struct btree *b,
-                                      struct btree_node_iter *node_iter,
-                                      struct bset_tree *t,
-                                      struct bkey_packed *where,
-                                      unsigned clobber_u64s,
-                                      unsigned new_u64s)
-{
-       const struct bkey_packed *end = btree_bkey_last(b, t);
-       struct btree_node_iter_set *set;
-       unsigned offset = __btree_node_key_to_offset(b, where);
-       int shift = new_u64s - clobber_u64s;
-       unsigned old_end = t->end_offset - shift;
-       unsigned orig_iter_pos = node_iter->data[0].k;
-       bool iter_current_key_modified =
-               orig_iter_pos >= offset &&
-               orig_iter_pos <= offset + clobber_u64s;
-
-       btree_node_iter_for_each(node_iter, set)
-               if (set->end == old_end)
-                       goto found;
-
-       /* didn't find the bset in the iterator - might have to re-add it: */
-       if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-               bch2_btree_node_iter_push(node_iter, b, where, end);
-               goto fixup_done;
-       } else {
-               /* Iterator is after key that changed */
-               return;
-       }
-found:
-       set->end = t->end_offset;
-
-       /* Iterator hasn't gotten to the key that changed yet: */
-       if (set->k < offset)
-               return;
-
-       if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
-               set->k = offset;
-       } else if (set->k < offset + clobber_u64s) {
-               set->k = offset + new_u64s;
-               if (set->k == set->end)
-                       bch2_btree_node_iter_set_drop(node_iter, set);
-       } else {
-               /* Iterator is after key that changed */
-               set->k = (int) set->k + shift;
-               return;
-       }
-
-       bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
-       if (node_iter->data[0].k != orig_iter_pos)
-               iter_current_key_modified = true;
-
-       /*
-        * When a new key is added, and the node iterator now points to that
-        * key, the iterator might have skipped past deleted keys that should
-        * come after the key the iterator now points to. We have to rewind to
-        * before those deleted keys - otherwise
-        * bch2_btree_node_iter_prev_all() breaks:
-        */
-       if (!bch2_btree_node_iter_end(node_iter) &&
-           iter_current_key_modified &&
-           b->c.level) {
-               struct bkey_packed *k, *k2, *p;
-
-               k = bch2_btree_node_iter_peek_all(node_iter, b);
-
-               for_each_bset(b, t) {
-                       bool set_pos = false;
-
-                       if (node_iter->data[0].end == t->end_offset)
-                               continue;
-
-                       k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-                       while ((p = bch2_bkey_prev_all(b, t, k2)) &&
-                              bkey_iter_cmp(b, k, p) < 0) {
-                               k2 = p;
-                               set_pos = true;
-                       }
-
-                       if (set_pos)
-                               btree_node_iter_set_set_pos(node_iter,
-                                                           b, t, k2);
-               }
-       }
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
-                             struct btree_path *path,
-                             struct btree *b,
-                             struct btree_node_iter *node_iter,
-                             struct bkey_packed *where,
-                             unsigned clobber_u64s,
-                             unsigned new_u64s)
-{
-       struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
-       struct btree_path *linked;
-       unsigned i;
-
-       if (node_iter != &path->l[b->c.level].iter) {
-               __bch2_btree_node_iter_fix(path, b, node_iter, t,
-                                          where, clobber_u64s, new_u64s);
-
-               if (static_branch_unlikely(&bch2_debug_check_iterators))
-                       bch2_btree_node_iter_verify(node_iter, b);
-       }
-
-       trans_for_each_path_with_node(trans, b, linked, i) {
-               __bch2_btree_node_iter_fix(linked, b,
-                                          &linked->l[b->c.level].iter, t,
-                                          where, clobber_u64s, new_u64s);
-               bch2_btree_path_verify_level(trans, linked, b->c.level);
-       }
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
-                                                 struct btree_path_level *l,
-                                                 struct bkey *u,
-                                                 struct bkey_packed *k)
-{
-       if (unlikely(!k)) {
-               /*
-                * signal to bch2_btree_iter_peek_slot() that we're currently at
-                * a hole
-                */
-               u->type = KEY_TYPE_deleted;
-               return bkey_s_c_null;
-       }
-
-       return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
-                                                       struct btree_path_level *l,
-                                                       struct bkey *u)
-{
-       return __btree_iter_unpack(c, l, u,
-                       bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
-                                                   struct btree_path *path,
-                                                   struct btree_path_level *l,
-                                                   struct bkey *u)
-{
-       struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-                       bch2_btree_node_iter_prev(&l->iter, l->b));
-
-       path->pos = k.k ? k.k->p : l->b->data->min_key;
-       trans->paths_sorted = false;
-       bch2_btree_path_verify_level(trans, path, l - path->l);
-       return k;
-}
-
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
-                                            struct btree_path_level *l,
-                                            int max_advance)
-{
-       struct bkey_packed *k;
-       int nr_advanced = 0;
-
-       while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-              bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
-               if (max_advance > 0 && nr_advanced >= max_advance)
-                       return false;
-
-               bch2_btree_node_iter_advance(&l->iter, l->b);
-               nr_advanced++;
-       }
-
-       return true;
-}
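-
-/*
- * Editor's note: this returns false only when it gives up after max_advance
- * steps (max_advance <= 0 means no limit).  __bch2_btree_path_set_pos()
- * below calls it with a limit of 8, reinitializing the node iterator from
- * scratch when advancing key by key would be slower.
- */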
-
-static inline void __btree_path_level_init(struct btree_path *path,
-                                          unsigned level)
-{
-       struct btree_path_level *l = &path->l[level];
-
-       bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-       /*
-        * Iterators to interior nodes should always be pointed at the first
-        * non-whiteout:
-        */
-       if (level)
-               bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
-                               struct btree_path *path,
-                               struct btree *b)
-{
-       BUG_ON(path->cached);
-
-       EBUG_ON(!btree_path_pos_in_node(path, b));
-
-       path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
-       path->l[b->c.level].b = b;
-       __btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-
-       trans_for_each_update(trans, i)
-               if (!i->cached &&
-                   i->level    == b->c.level &&
-                   i->btree_id == b->c.btree_id &&
-                   bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
-                   bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
-                       i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
-                       if (unlikely(trans->journal_replay_not_finished)) {
-                               struct bkey_i *j_k =
-                                       bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
-                                                                   i->k->k.p);
-
-                               if (j_k) {
-                                       i->old_k = j_k->k;
-                                       i->old_v = &j_k->v;
-                               }
-                       }
-               }
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
-                        struct btree_path *path,
-                        struct btree *b)
-{
-       struct btree_path *prev;
-
-       BUG_ON(!btree_path_pos_in_node(path, b));
-
-       while ((prev = prev_btree_path(trans, path)) &&
-              btree_path_pos_in_node(prev, b))
-               path = prev;
-
-       for (;
-            path && btree_path_pos_in_node(path, b);
-            path = next_btree_path(trans, path))
-               if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
-                       enum btree_node_locked_type t =
-                               btree_lock_want(path, b->c.level);
-
-                       if (t != BTREE_NODE_UNLOCKED) {
-                               btree_node_unlock(trans, path, b->c.level);
-                               six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-                               mark_btree_node_locked(trans, path, b->c.level, t);
-                       }
-
-                       bch2_btree_path_level_init(trans, path, b);
-               }
-
-       bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-void bch2_trans_node_drop(struct btree_trans *trans,
-                         struct btree *b)
-{
-       struct btree_path *path;
-       unsigned i, level = b->c.level;
-
-       trans_for_each_path(trans, path, i)
-               if (path->l[level].b == b) {
-                       btree_node_unlock(trans, path, level);
-                       path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-               }
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path_with_node(trans, b, path, i)
-               __btree_path_level_init(path, b->c.level);
-
-       bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
-static inline int btree_path_lock_root(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      unsigned depth_want,
-                                      unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
-       enum six_lock_type lock_type;
-       unsigned i;
-       int ret;
-
-       EBUG_ON(path->nodes_locked);
-
-       while (1) {
-               struct btree *b = READ_ONCE(r->b);
-               if (unlikely(!b)) {
-                       BUG_ON(!r->error);
-                       return r->error;
-               }
-
-               path->level = READ_ONCE(b->c.level);
-
-               if (unlikely(path->level < depth_want)) {
-                       /*
-                        * the root is at a lower depth than the depth we want:
-                        * got to the end of the btree, or we're walking nodes
-                        * greater than some depth and there are no nodes >=
-                        * that depth
-                        */
-                       path->level = depth_want;
-                       for (i = path->level; i < BTREE_MAX_DEPTH; i++)
-                               path->l[i].b = NULL;
-                       return 1;
-               }
-
-               lock_type = __btree_lock_want(path, path->level);
-               ret = btree_node_lock(trans, path, &b->c,
-                                     path->level, lock_type, trace_ip);
-               if (unlikely(ret)) {
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                               return ret;
-                       BUG();
-               }
-
-               if (likely(b == READ_ONCE(r->b) &&
-                          b->c.level == path->level &&
-                          !race_fault())) {
-                       for (i = 0; i < path->level; i++)
-                               path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
-                       path->l[path->level].b = b;
-                       for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
-                               path->l[i].b = NULL;
-
-                       mark_btree_node_locked(trans, path, path->level,
-                                              (enum btree_node_locked_type) lock_type);
-                       bch2_btree_path_level_init(trans, path, b);
-                       return 0;
-               }
-
-               six_unlock_type(&b->c.lock, lock_type);
-       }
-}
-
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_path_level *l = path_l(path);
-       struct btree_node_iter node_iter = l->iter;
-       struct bkey_packed *k;
-       struct bkey_buf tmp;
-       unsigned nr = test_bit(BCH_FS_started, &c->flags)
-               ? (path->level > 1 ? 0 :  2)
-               : (path->level > 1 ? 1 : 16);
-       bool was_locked = btree_node_locked(path, path->level);
-       int ret = 0;
-
-       bch2_bkey_buf_init(&tmp);
-
-       while (nr-- && !ret) {
-               if (!bch2_btree_node_relock(trans, path, path->level))
-                       break;
-
-               bch2_btree_node_iter_advance(&node_iter, l->b);
-               k = bch2_btree_node_iter_peek(&node_iter, l->b);
-               if (!k)
-                       break;
-
-               bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-               ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-                                              path->level - 1);
-       }
-
-       if (!was_locked)
-               btree_node_unlock(trans, path, path->level);
-
-       bch2_bkey_buf_exit(&tmp, c);
-       return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
-                                struct btree_and_journal_iter *jiter)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
-       struct bkey_buf tmp;
-       unsigned nr = test_bit(BCH_FS_started, &c->flags)
-               ? (path->level > 1 ? 0 :  2)
-               : (path->level > 1 ? 1 : 16);
-       bool was_locked = btree_node_locked(path, path->level);
-       int ret = 0;
-
-       bch2_bkey_buf_init(&tmp);
-
-       jiter->fail_if_too_many_whiteouts = true;
-
-       while (nr-- && !ret) {
-               if (!bch2_btree_node_relock(trans, path, path->level))
-                       break;
-
-               bch2_btree_and_journal_iter_advance(jiter);
-               k = bch2_btree_and_journal_iter_peek(jiter);
-               if (!k.k)
-                       break;
-
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-               ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
-                                              path->level - 1);
-       }
-
-       if (!was_locked)
-               btree_node_unlock(trans, path, path->level);
-
-       bch2_bkey_buf_exit(&tmp, c);
-       return ret;
-}
-
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
-                                           struct btree_path *path,
-                                           unsigned plevel, struct btree *b)
-{
-       struct btree_path_level *l = &path->l[plevel];
-       bool locked = btree_node_locked(path, plevel);
-       struct bkey_packed *k;
-       struct bch_btree_ptr_v2 *bp;
-
-       if (!bch2_btree_node_relock(trans, path, plevel))
-               return;
-
-       k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-       BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
-       bp = (void *) bkeyp_val(&l->b->format, k);
-       bp->mem_ptr = (unsigned long)b;
-
-       if (!locked)
-               btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
-                                                    struct btree_path *path,
-                                                    unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_path_level *l = path_l(path);
-       struct btree_and_journal_iter jiter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
-       k = bch2_btree_and_journal_iter_peek(&jiter);
-       if (!k.k) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "node not found at pos ");
-               bch2_bpos_to_text(&buf, path->pos);
-               prt_str(&buf, " at btree ");
-               bch2_btree_pos_to_text(&buf, c, l->b);
-
-               ret = bch2_fs_topology_error(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               goto err;
-       }
-
-       bkey_reassemble(&trans->btree_path_down, k);
-
-       if ((flags & BTREE_ITER_prefetch) &&
-           c->opts.btree_node_prefetch)
-               ret = btree_path_prefetch_j(trans, path, &jiter);
-
-err:
-       bch2_btree_and_journal_iter_exit(&jiter);
-       return ret;
-}
-
-static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
-                                                    struct btree_path *path)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-
-       prt_str(&buf, "node not found at pos ");
-       bch2_bpos_to_text(&buf, path->pos);
-       prt_str(&buf, " within parent node ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
-
-       bch2_fs_fatal_error(c, "%s", buf.buf);
-       printbuf_exit(&buf);
-       return bch_err_throw(c, btree_need_topology_repair);
-}
-
-static __always_inline int btree_path_down(struct btree_trans *trans,
-                                          struct btree_path *path,
-                                          unsigned flags,
-                                          unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_path_level *l = path_l(path);
-       struct btree *b;
-       unsigned level = path->level - 1;
-       enum six_lock_type lock_type = __btree_lock_want(path, level);
-       int ret;
-
-       EBUG_ON(!btree_node_locked(path, path->level));
-
-       if (unlikely(trans->journal_replay_not_finished)) {
-               ret = btree_node_iter_and_journal_peek(trans, path, flags);
-               if (ret)
-                       return ret;
-       } else {
-               struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-               if (unlikely(!k))
-                       return btree_node_missing_err(trans, path);
-
-               bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
-
-               if (unlikely((flags & BTREE_ITER_prefetch)) &&
-                   c->opts.btree_node_prefetch) {
-                       ret = btree_path_prefetch(trans, path);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
-                               level, lock_type, trace_ip);
-       ret = PTR_ERR_OR_ZERO(b);
-       if (unlikely(ret))
-               return ret;
-
-       if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
-           likely(!trans->journal_replay_not_finished &&
-                  trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
-               btree_node_mem_ptr_set(trans, path, level + 1, b);
-
-       if (btree_node_read_locked(path, level + 1))
-               btree_node_unlock(trans, path, level + 1);
-
-       mark_btree_node_locked(trans, path, level,
-                              (enum btree_node_locked_type) lock_type);
-       path->level = level;
-       bch2_btree_path_level_init(trans, path, b);
-
-       bch2_btree_path_verify_locks(trans, path);
-       return 0;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_path *path;
-       unsigned long trace_ip = _RET_IP_;
-       unsigned i;
-       int ret = 0;
-
-       if (trans->in_traverse_all)
-               return bch_err_throw(c, transaction_restart_in_traverse_all);
-
-       trans->in_traverse_all = true;
-retry_all:
-       trans->restarted = 0;
-       trans->last_restarted_ip = 0;
-
-       trans_for_each_path(trans, path, i)
-               path->should_be_locked = false;
-
-       btree_trans_sort_paths(trans);
-
-       bch2_trans_unlock(trans);
-       cond_resched();
-       trans_set_locked(trans, false);
-
-       if (unlikely(trans->memory_allocation_failure)) {
-               struct closure cl;
-
-               closure_init_stack(&cl);
-
-               do {
-                       ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-                       closure_sync(&cl);
-               } while (ret);
-       }
-
-       /* Now, redo traversals in correct order: */
-       i = 0;
-       while (i < trans->nr_sorted) {
-               btree_path_idx_t idx = trans->sorted[i];
-
-               /*
-                * Traversing a path can cause another path to be added at about
-                * the same position:
-                */
-               if (trans->paths[idx].uptodate) {
-                       __btree_path_get(trans, &trans->paths[idx], false);
-                       ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
-                       __btree_path_put(trans, &trans->paths[idx], false);
-
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-                           bch2_err_matches(ret, ENOMEM))
-                               goto retry_all;
-                       if (ret)
-                               goto err;
-               } else {
-                       i++;
-               }
-       }
-
-       /*
-        * We used to assert that all paths had been traversed here
-        * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
-        * path->should_be_locked is not set yet, we might have unlocked and
-        * then failed to relock a path - that's fine.
-        */
-err:
-       bch2_btree_cache_cannibalize_unlock(trans);
-
-       trans->in_traverse_all = false;
-
-       trace_and_count(c, trans_traverse_all, trans, trace_ip);
-       return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
-                                               unsigned l, int check_pos)
-{
-       if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
-               return false;
-       if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
-               return false;
-       return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
-                                       struct btree_path *path,
-                                       unsigned l, int check_pos)
-{
-       return is_btree_node(path, l) &&
-               bch2_btree_node_relock(trans, path, l) &&
-               btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
-                                     struct btree_path *path,
-                                     unsigned new_level)
-{
-       unsigned l;
-
-       path->level = new_level;
-
-       for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
-               if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(trans, path, l);
-
-       btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-       bch2_btree_path_verify(trans, path);
-}
-
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
-                                                        struct btree_path *path,
-                                                        int check_pos)
-{
-       unsigned i, l = path->level;
-again:
-       while (btree_path_node(path, l) &&
-              !btree_path_good_node(trans, path, l, check_pos))
-               __btree_path_set_level_up(trans, path, l++);
-
-       /* If we need intent locks, take them too: */
-       for (i = l + 1;
-            i < path->locks_want && btree_path_node(path, i);
-            i++)
-               if (!bch2_btree_node_relock(trans, path, i)) {
-                       while (l <= i)
-                               __btree_path_set_level_up(trans, path, l++);
-                       goto again;
-               }
-
-       return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
-                                                    struct btree_path *path,
-                                                    int check_pos)
-{
-       return likely(btree_node_locked(path, path->level) &&
-                     btree_path_check_pos_in_node(path, path->level, check_pos))
-               ? path->level
-               : __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
-                                btree_path_idx_t path_idx,
-                                unsigned flags,
-                                unsigned long trace_ip)
-{
-       struct btree_path *path = &trans->paths[path_idx];
-       unsigned depth_want = path->level;
-       int ret = -((int) trans->restarted);
-
-       if (unlikely(ret))
-               goto out;
-
-       if (unlikely(!trans->srcu_held))
-               bch2_trans_srcu_lock(trans);
-
-       trace_btree_path_traverse_start(trans, path);
-
-       /*
-        * Ensure we obey path->should_be_locked: if it's set, we can't unlock
-        * and re-traverse the path without a transaction restart:
-        */
-       if (path->should_be_locked) {
-               ret = bch2_btree_path_relock(trans, path, trace_ip);
-               goto out;
-       }
-
-       if (path->cached) {
-               ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
-               goto out;
-       }
-
-       path = &trans->paths[path_idx];
-
-       if (unlikely(path->level >= BTREE_MAX_DEPTH))
-               goto out_uptodate;
-
-       path->level = btree_path_up_until_good_node(trans, path, 0);
-       unsigned max_level = path->level;
-
-       EBUG_ON(btree_path_node(path, path->level) &&
-               !btree_node_locked(path, path->level));
-
-       /*
-        * Note: path->l[path->level].b may be temporarily NULL here - that
-        * would indicate to other code that we got to the end of the btree,
-        * here it indicates that relocking the root failed - it's critical that
-        * btree_path_lock_root() comes next and that it can't fail
-        */
-       while (path->level > depth_want) {
-               ret = btree_path_node(path, path->level)
-                       ? btree_path_down(trans, path, flags, trace_ip)
-                       : btree_path_lock_root(trans, path, depth_want, trace_ip);
-               if (unlikely(ret)) {
-                       if (ret == 1) {
-                               /*
-                                * No nodes at this level - got to the end of
-                                * the btree:
-                                */
-                               ret = 0;
-                               goto out;
-                       }
-
-                       __bch2_btree_path_unlock(trans, path);
-                       path->level = depth_want;
-                       path->l[path->level].b = ERR_PTR(ret);
-                       goto out;
-               }
-       }
-
-       if (unlikely(max_level > path->level)) {
-               struct btree_path *linked;
-               unsigned iter;
-
-               trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
-                       for (unsigned j = path->level + 1; j < max_level; j++)
-                               linked->l[j] = path->l[j];
-       }
-
-out_uptodate:
-       path->uptodate = BTREE_ITER_UPTODATE;
-       trace_btree_path_traverse_end(trans, path);
-out:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
-               panic("ret %s (%i) trans->restarted %s (%i)\n",
-                     bch2_err_str(ret), ret,
-                     bch2_err_str(trans->restarted), trans->restarted);
-       bch2_btree_path_verify(trans, path);
-       return ret;
-}
-
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
-                           struct btree_path *src)
-{
-       unsigned i, offset = offsetof(struct btree_path, pos);
-
-       memcpy((void *) dst + offset,
-              (void *) src + offset,
-              sizeof(struct btree_path) - offset);
-
-       for (i = 0; i < BTREE_MAX_DEPTH; i++) {
-               unsigned t = btree_node_locked_type(dst, i);
-
-               if (t != BTREE_NODE_UNLOCKED)
-                       six_lock_increment(&dst->l[i].b->c.lock, t);
-       }
-}
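-
-/*
- * Editor's note: the memcpy skips everything before 'pos' (such as the
- * refcounts, which stay per-path), and six_lock_increment() takes an extra
- * reference on each lock the source holds, so the copy can drop its locks
- * independently of the original.
- */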
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
-                                        bool intent, unsigned long ip)
-{
-       btree_path_idx_t new = btree_path_alloc(trans, src);
-       btree_path_copy(trans, trans->paths + new, trans->paths + src);
-       __btree_path_get(trans, trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
-       trans->paths[new].ip_allocated = ip;
-#endif
-       return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
-                       btree_path_idx_t path, bool intent, unsigned long ip)
-{
-       struct btree_path *old = trans->paths + path;
-       __btree_path_put(trans, trans->paths + path, intent);
-       path = btree_path_clone(trans, path, intent, ip);
-       trace_btree_path_clone(trans, old, trans->paths + path);
-       trans->paths[path].preserve = false;
-       return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
-                         btree_path_idx_t path_idx, struct bpos new_pos,
-                         bool intent, unsigned long ip)
-{
-       int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       EBUG_ON(!trans->paths[path_idx].ref);
-
-       trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
-
-       path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
-       struct btree_path *path = trans->paths + path_idx;
-       path->pos               = new_pos;
-       trans->paths_sorted     = false;
-
-       if (unlikely(path->cached)) {
-               btree_node_unlock(trans, path, 0);
-               path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-               btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-               goto out;
-       }
-
-       unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
-       if (btree_path_node(path, level)) {
-               struct btree_path_level *l = &path->l[level];
-
-               BUG_ON(!btree_node_locked(path, level));
-               /*
-                * We might have to skip over many keys, or just a few: try
-                * advancing the node iterator; if we'd have to skip too many
-                * keys, or if we're rewinding (which is expensive), just
-                * reinit it.
-                */
-               if (cmp < 0 ||
-                   !btree_path_advance_to_pos(path, l, 8))
-                       bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
-               /*
-                * Iterators to interior nodes should always be pointed at the first
-                * non-whiteout:
-                */
-               if (unlikely(level))
-                       bch2_btree_node_iter_peek(&l->iter, l->b);
-       }
-
-       if (unlikely(level != path->level)) {
-               btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-               __bch2_btree_path_unlock(trans, path);
-       }
-out:
-       bch2_btree_path_verify(trans, path);
-       return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-       struct btree_path *sib;
-
-       sib = prev_btree_path(trans, path);
-       if (sib && !btree_path_cmp(sib, path))
-               return sib;
-
-       sib = next_btree_path(trans, path);
-       if (sib && !btree_path_cmp(sib, path))
-               return sib;
-
-       return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
-       struct btree_path *sib;
-
-       sib = prev_btree_path(trans, path);
-       if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-               return sib;
-
-       sib = next_btree_path(trans, path);
-       if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
-               return sib;
-
-       return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
-       __bch2_btree_path_unlock(trans, trans->paths + path);
-       btree_path_list_remove(trans, trans->paths + path);
-       __clear_bit(path, trans->paths_allocated);
-}
-
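-/*
- * Check whether a path could be relocked without actually taking any locks:
- * every level from path->level up to locks_want must still point at a live
- * btree node whose current SIX lock sequence number matches the saved
- * lock_seq.
- */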
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
-       unsigned l = path->level;
-
-       do {
-               if (!btree_path_node(path, l))
-                       break;
-
-               if (!is_btree_node(path, l))
-                       return false;
-
-               if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
-                       return false;
-
-               l++;
-       } while (l < path->locks_want);
-
-       return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
-       struct btree_path *path = trans->paths + path_idx, *dup = NULL;
-
-       if (!__btree_path_put(trans, path, intent))
-               return;
-
-       if (!path->preserve && !path->should_be_locked)
-               goto free;
-
-       dup = path->preserve
-               ? have_path_at_pos(trans, path)
-               : have_node_at_pos(trans, path);
-       if (!dup)
-               return;
-
-       /*
-        * If we need this path locked, the duplicate also has to be locked
-        * before we free this one:
-        */
-       if (path->should_be_locked &&
-           !dup->should_be_locked &&
-           !trans->restarted) {
-               if (!(trans->locked
-                     ? bch2_btree_path_relock_norestart(trans, dup)
-                     : bch2_btree_path_can_relock(trans, dup)))
-                       return;
-
-               dup->should_be_locked = true;
-       }
-
-       BUG_ON(path->should_be_locked &&
-              !trans->restarted &&
-              trans->locked &&
-              !btree_node_locked(dup, dup->level));
-
-       path->should_be_locked = false;
-       dup->preserve |= path->preserve;
-free:
-       trace_btree_path_free(trans, path_idx, dup);
-       __bch2_path_free(trans, path_idx);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
-       panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
-             trans->restart_count, restart_count,
-             (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct printbuf buf = PRINTBUF;
-       bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
-       panic("in transaction restart: %s, last restarted by\n%s",
-             bch2_err_str(trans->restarted),
-             buf.buf);
-#else
-       panic("in transaction restart: %s, last restarted by %pS\n",
-             bch2_err_str(trans->restarted),
-             (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
-       if (trans->restarted)
-               bch2_trans_in_restart_error(trans);
-
-       if (!trans->locked)
-               panic("trans should be locked, unlocked by %pS\n",
-                     (void *) trans->last_unlock_ip);
-
-       BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
-       prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
-                  trans->nr_updates, trans->fn, trans->journal_res.seq);
-       printbuf_indent_add(buf, 2);
-
-       trans_for_each_update(trans, i) {
-               struct bkey_s_c old = { &i->old_k, i->old_v };
-
-               prt_str(buf, "update: btree=");
-               bch2_btree_id_to_text(buf, i->btree_id);
-               prt_printf(buf, " cached=%u %pS\n",
-                          i->cached,
-                          (void *) i->ip_allocated);
-
-               prt_printf(buf, "  old ");
-               bch2_bkey_val_to_text(buf, trans->c, old);
-               prt_newline(buf);
-
-               prt_printf(buf, "  new ");
-               bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
-               prt_newline(buf);
-       }
-
-       for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
-            e != btree_trans_journal_entries_top(trans);
-            e = vstruct_next(e)) {
-               bch2_journal_entry_to_text(buf, trans->c, e);
-               prt_newline(buf);
-       }
-
-       printbuf_indent_sub(buf, 2);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-       struct btree_path *path = trans->paths + path_idx;
-
-       prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
-                  path_idx, path->ref, path->intent_ref,
-                  path->preserve ? 'P' : ' ',
-                  path->should_be_locked ? 'S' : ' ',
-                  path->cached ? 'C' : 'B');
-       bch2_btree_id_level_to_text(out, path->btree_id, path->level);
-       prt_str(out, " pos ");
-       bch2_bpos_to_text(out, path->pos);
-
-       if (!path->cached && btree_node_locked(path, path->level)) {
-               prt_char(out, ' ');
-               struct btree *b = path_l(path)->b;
-               bch2_bpos_to_text(out, b->data->min_key);
-               prt_char(out, '-');
-               bch2_bpos_to_text(out, b->key.k.p);
-       }
-
-#ifdef TRACK_PATH_ALLOCATED
-       prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
-       switch (t) {
-       case BTREE_NODE_UNLOCKED:
-               return "unlocked";
-       case BTREE_NODE_READ_LOCKED:
-               return "read";
-       case BTREE_NODE_INTENT_LOCKED:
-               return "intent";
-       case BTREE_NODE_WRITE_LOCKED:
-               return "write";
-       default:
-               return NULL;
-       }
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-       bch2_btree_path_to_text_short(out, trans, path_idx);
-
-       struct btree_path *path = trans->paths + path_idx;
-
-       prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
-       prt_newline(out);
-
-       printbuf_indent_add(out, 2);
-       for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
-               prt_printf(out, "l=%u locks %s seq %u node ", l,
-                          btree_node_locked_str(btree_node_locked_type(path, l)),
-                          path->l[l].lock_seq);
-
-               int ret = PTR_ERR_OR_ZERO(path->l[l].b);
-               if (ret)
-                       prt_str(out, bch2_err_str(ret));
-               else
-                       prt_printf(out, "%px", path->l[l].b);
-               prt_newline(out);
-       }
-       printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
-                               bool nosort)
-{
-       struct trans_for_each_path_inorder_iter iter;
-
-       if (!nosort)
-               btree_trans_sort_paths(trans);
-
-       trans_for_each_path_idx_inorder(trans, iter) {
-               bch2_btree_path_to_text_short(out, trans, iter.path_idx);
-               prt_newline(out);
-       }
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-       __bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
-       struct printbuf buf = PRINTBUF;
-
-       __bch2_trans_paths_to_text(&buf, trans, nosort);
-       bch2_trans_updates_to_text(&buf, trans);
-
-       bch2_print_str(trans->c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
-       __bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
-       struct btree_transaction_stats *s = btree_trans_stats(trans);
-       struct printbuf buf = PRINTBUF;
-       size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
-       bch2_trans_paths_to_text(&buf, trans);
-
-       if (!buf.allocation_failure) {
-               mutex_lock(&s->lock);
-               if (nr > s->nr_max_paths) {
-                       s->nr_max_paths = nr;
-                       swap(s->max_paths_text, buf.buf);
-               }
-               mutex_unlock(&s->lock);
-       }
-
-       printbuf_exit(&buf);
-
-       trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
-       if (trace_trans_restart_too_many_iters_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_trans_paths_to_text(&buf, trans);
-               trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       count_event(trans->c, trans_restart_too_many_iters);
-
-       return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
-       bch2_dump_trans_paths_updates(trans);
-       bch_err(trans->c, "trans path overflow");
-}
-
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
-       unsigned nr = trans->nr_paths * 2;
-
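-       /*
-        * Single allocation holding, in order: the paths_allocated bitmap,
-        * struct btree_trans_paths, the paths array, the sorted-index array
-        * (plus 8 bytes of padding), and the updates array:
-        */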
-       void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
-                         sizeof(struct btree_trans_paths) +
-                         nr * sizeof(struct btree_path) +
-                         nr * sizeof(btree_path_idx_t) + 8 +
-                         nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
-       unsigned long *paths_allocated = p;
-       memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
-       p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
-       p += sizeof(struct btree_trans_paths);
-       struct btree_path *paths = p;
-       *trans_paths_nr(paths) = nr;
-       memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
-       p += nr * sizeof(struct btree_path);
-
-       btree_path_idx_t *sorted = p;
-       memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
-       p += nr * sizeof(btree_path_idx_t) + 8;
-
-       struct btree_insert_entry *updates = p;
-       memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
-       unsigned long *old = trans->paths_allocated;
-
-       rcu_assign_pointer(trans->paths_allocated,      paths_allocated);
-       rcu_assign_pointer(trans->paths,                paths);
-       rcu_assign_pointer(trans->sorted,               sorted);
-       rcu_assign_pointer(trans->updates,              updates);
-
-       trans->nr_paths         = nr;
-
-       if (old != trans->_paths_allocated)
-               kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
-                                               btree_path_idx_t pos)
-{
-       btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
-       if (unlikely(idx == trans->nr_paths)) {
-               if (trans->nr_paths == BTREE_ITER_MAX) {
-                       btree_path_overflow(trans);
-                       return 0;
-               }
-
-               btree_paths_realloc(trans);
-       }
-
-       /*
-        * Do this before marking the new path as allocated, since it won't be
-        * initialized yet:
-        */
-       if (unlikely(idx > trans->nr_paths_max))
-               bch2_trans_update_max_paths(trans);
-
-       __set_bit(idx, trans->paths_allocated);
-
-       struct btree_path *path = &trans->paths[idx];
-       path->ref               = 0;
-       path->intent_ref        = 0;
-       path->nodes_locked      = 0;
-
-       btree_path_list_add(trans, pos, idx);
-       trans->paths_sorted = false;
-       return idx;
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
-                            enum btree_id btree_id, struct bpos pos,
-                            unsigned locks_want, unsigned level,
-                            unsigned flags, unsigned long ip)
-{
-       struct btree_path *path;
-       bool cached = flags & BTREE_ITER_cached;
-       bool intent = flags & BTREE_ITER_intent;
-       struct trans_for_each_path_inorder_iter iter;
-       btree_path_idx_t path_pos = 0, path_idx;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       bch2_trans_verify_locks(trans);
-
-       btree_trans_sort_paths(trans);
-
-       if (intent)
-               locks_want = max(locks_want, level + 1);
-       locks_want = min(locks_want, BTREE_MAX_DEPTH);
-
-       trans_for_each_path_inorder(trans, path, iter) {
-               if (__btree_path_cmp(path,
-                                    btree_id,
-                                    cached,
-                                    pos,
-                                    level) > 0)
-                       break;
-
-               path_pos = iter.path_idx;
-       }
-
-       if (path_pos &&
-           trans->paths[path_pos].cached       == cached &&
-           trans->paths[path_pos].btree_id     == btree_id &&
-           trans->paths[path_pos].level        == level &&
-           bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
-               trace_btree_path_get(trans, trans->paths + path_pos, &pos);
-
-               __btree_path_get(trans, trans->paths + path_pos, intent);
-               path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
-               path = trans->paths + path_idx;
-       } else {
-               path_idx = btree_path_alloc(trans, path_pos);
-               path = trans->paths + path_idx;
-
-               __btree_path_get(trans, path, intent);
-               path->pos                       = pos;
-               path->btree_id                  = btree_id;
-               path->cached                    = cached;
-               path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
-               path->should_be_locked          = false;
-               path->level                     = level;
-               path->locks_want                = locks_want;
-               path->nodes_locked              = 0;
-               for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
-                       path->l[i].b            = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
-               path->ip_allocated              = ip;
-#endif
-               trans->paths_sorted             = false;
-
-               trace_btree_path_alloc(trans, path);
-       }
-
-       if (!(flags & BTREE_ITER_nopreserve))
-               path->preserve = true;
-
-       /*
-        * If the path has locks_want greater than requested, we don't downgrade
-        * it here - after a transaction restart because a btree node split
-        * needed to upgrade locks, we might be putting/getting the iterator
-        * again.
-        * Downgrading iterators only happens via bch2_trans_downgrade(), after
-        * a successful transaction commit.
-        */
-
-       return path_idx;
-}
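-
-/*
- * Sketch of typical usage (hypothetical caller, not from this file): a path
- * returned by bch2_path_get() must eventually be released with
- * bch2_path_put(), passing the same intent flag:
- *
- *     idx = bch2_path_get(trans, BTREE_ID_extents, pos, 1, 0,
- *                         BTREE_ITER_intent, _THIS_IP_);
- *     ret = bch2_btree_path_traverse(trans, idx, BTREE_ITER_intent);
- *     ...
- *     bch2_path_put(trans, idx, true);
- */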
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
-                                           enum btree_id btree_id,
-                                           unsigned level,
-                                           struct bpos pos)
-{
-       btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
-                            BTREE_ITER_nopreserve|
-                            BTREE_ITER_intent, _RET_IP_);
-       path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
-       struct btree_path *path = trans->paths + path_idx;
-       bch2_btree_path_downgrade(trans, path);
-       __bch2_btree_path_unlock(trans, path);
-       return path_idx;
-}
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
-       struct btree_path_level *l = path_l(path);
-       struct bkey_packed *_k;
-       struct bkey_s_c k;
-
-       if (unlikely(!l->b))
-               return bkey_s_c_null;
-
-       EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-       EBUG_ON(!btree_node_locked(path, path->level));
-
-       if (!path->cached) {
-               _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-               k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
-               EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
-               if (!k.k || !bpos_eq(path->pos, k.k->p))
-                       goto hole;
-       } else {
-               struct bkey_cached *ck = (void *) path->l[0].b;
-               if (!ck)
-                       return bkey_s_c_null;
-
-               EBUG_ON(path->btree_id != ck->key.btree_id ||
-                       !bkey_eq(path->pos, ck->key.pos));
-
-               *u = ck->k->k;
-               k = (struct bkey_s_c) { u, &ck->k->v };
-       }
-
-       return k;
-hole:
-       bkey_init(u);
-       u->p = path->pos;
-       return (struct bkey_s_c) { u, NULL };
-}
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (!iter->path || trans->restarted)
-               return;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       path->preserve          = false;
-       if (path->ref == 1)
-               path->should_be_locked  = false;
-}
-
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return bch2_btree_path_traverse(trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       iter->path = bch2_btree_path_set_pos(trans, iter->path,
-                                       btree_iter_search_key(iter),
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-       int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-       if (ret)
-               return ret;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       if (btree_path_node(path, path->level))
-               btree_path_set_should_be_locked(trans, path);
-       return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
-                                       struct btree_iter *iter)
-{
-       struct btree *b = NULL;
-       int ret;
-
-       EBUG_ON(trans->paths[iter->path].cached);
-       bch2_btree_iter_verify(trans, iter);
-
-       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-       if (ret)
-               goto err;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       b = btree_path_node(path, path->level);
-       if (!b)
-               goto out;
-
-       BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = b->key.k.p;
-
-       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-       btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out:
-       bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(trans, iter);
-
-       return b;
-err:
-       b = ERR_PTR(ret);
-       goto out;
-}
-
-/* Only kept for bcachefs-tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
-                                                   struct btree_iter *iter)
-{
-       struct btree *b;
-
-       while (b = bch2_btree_iter_peek_node(trans, iter),
-              bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
-               bch2_trans_begin(trans);
-
-       return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct btree *b = NULL;
-       int ret;
-
-       EBUG_ON(trans->paths[iter->path].cached);
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       bch2_btree_iter_verify(trans, iter);
-
-       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-       if (ret)
-               goto err;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-
-       /* already at end? */
-       if (!btree_path_node(path, path->level))
-               return NULL;
-
-       /* got to end? */
-       if (!btree_path_node(path, path->level + 1)) {
-               path->should_be_locked = false;
-               btree_path_set_level_up(trans, path);
-               return NULL;
-       }
-
-       /*
-        * We don't correctly handle nodes with extra intent locks here:
-        * downgrade so we don't violate locking invariants
-        */
-       bch2_btree_path_downgrade(trans, path);
-
-       if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
-               trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
-               ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
-               __bch2_btree_path_unlock(trans, path);
-               path->l[path->level].b          = ERR_PTR(-BCH_ERR_no_btree_node_relock);
-               path->l[path->level + 1].b      = ERR_PTR(-BCH_ERR_no_btree_node_relock);
-               btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-               goto err;
-       }
-
-       b = btree_path_node(path, path->level + 1);
-
-       if (bpos_eq(iter->pos, b->key.k.p)) {
-               __btree_path_set_level_up(trans, path, path->level++);
-       } else {
-               if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(trans, path, path->level + 1);
-
-               /*
-                * Haven't gotten to the end of the parent node: go back down to
-                * the next child node
-                */
-               iter->path = bch2_btree_path_set_pos(trans, iter->path,
-                                       bpos_successor(iter->pos),
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-               path = btree_iter_path(trans, iter);
-               btree_path_set_level_down(trans, path, iter->min_depth);
-
-               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-               if (ret)
-                       goto err;
-
-               path = btree_iter_path(trans, iter);
-               b = path->l[path->level].b;
-       }
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = b->key.k.p;
-
-       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-       btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-       EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
-       bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(trans, iter);
-
-       return b;
-err:
-       b = ERR_PTR(ret);
-       goto out;
-}
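-
-/*
- * Sketch of walking a btree node by node (hypothetical; process() is a
- * stand-in, and the for_each_btree_node() helpers wrap this pattern), with
- * transaction restarts handled by the caller:
- *
- *     for (b = bch2_btree_iter_peek_node(trans, &iter);
- *          b && !IS_ERR(b);
- *          b = bch2_btree_iter_next_node(trans, &iter))
- *             process(b);
- */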
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct bpos pos = iter->k.p;
-       bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-                    ? bpos_eq(pos, SPOS_MAX)
-                    : bkey_eq(pos, SPOS_MAX));
-
-       if (ret && !(iter->flags & BTREE_ITER_is_extents))
-               pos = bkey_successor(iter, pos);
-       bch2_btree_iter_set_pos(trans, iter, pos);
-       return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct bpos pos = bkey_start_pos(&iter->k);
-       bool ret = !(iter->flags & BTREE_ITER_all_snapshots
-                    ? bpos_eq(pos, POS_MIN)
-                    : bkey_eq(pos, POS_MIN));
-
-       if (ret && !(iter->flags & BTREE_ITER_is_extents))
-               pos = bkey_predecessor(iter, pos);
-       bch2_btree_iter_set_pos(trans, iter, pos);
-       return ret;
-}
-
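-/*
- * The following helpers overlay keys from the transaction's own pending
- * updates onto a peeked key: an update within the search range (or, for
- * peek_slot, at exactly iter->pos) replaces the candidate key.
- */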
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
-                                       struct bpos search_key, struct bkey_s_c *k)
-{
-       struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
-       trans_for_each_update(trans, i)
-               if (!i->key_cache_already_flushed &&
-                   i->btree_id == iter->btree_id &&
-                   bpos_le(i->k->k.p, search_key) &&
-                   bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
-                       iter->k = i->k->k;
-                       *k = bkey_i_to_s_c(i->k);
-               }
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
-                                  struct bpos search_key,
-                                  struct bkey_s_c *k)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-       struct bpos end = path_l(path)->b->key.k.p;
-
-       trans_for_each_update(trans, i)
-               if (!i->key_cache_already_flushed &&
-                   i->btree_id == iter->btree_id &&
-                   bpos_ge(i->k->k.p, search_key) &&
-                   bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
-                       iter->k = i->k->k;
-                       *k = bkey_i_to_s_c(i->k);
-               }
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
-                                       struct bkey_s_c *k)
-{
-       trans_for_each_update(trans, i)
-               if (!i->key_cache_already_flushed &&
-                   i->btree_id == iter->btree_id &&
-                   bpos_eq(i->k->k.p, iter->pos)) {
-                       iter->k = i->k->k;
-                       *k = bkey_i_to_s_c(i->k);
-               }
-}
-
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
-                                             struct btree_iter *iter,
-                                             struct bpos search_pos,
-                                             struct bpos end_pos)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-
-       return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
-                                          path->level,
-                                          search_pos,
-                                          end_pos,
-                                          &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
-                                             struct btree_iter *iter)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-       struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
-
-       if (k) {
-               iter->k = k->k;
-               return bkey_i_to_s_c(k);
-       } else {
-               return bkey_s_c_null;
-       }
-}
-
-static noinline
-void btree_trans_peek_journal(struct btree_trans *trans,
-                             struct btree_iter *iter,
-                             struct bpos search_key,
-                             struct bkey_s_c *k)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-       struct bkey_i *next_journal =
-               bch2_btree_journal_peek(trans, iter, search_key,
-                               k->k ? k->k->p : path_l(path)->b->key.k.p);
-       if (next_journal) {
-               iter->k = next_journal->k;
-               *k = bkey_i_to_s_c(next_journal);
-       }
-}
-
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
-                                             struct btree_iter *iter,
-                                             struct bpos search_key,
-                                             struct bpos end_pos)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-
-       return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
-                                          path->level,
-                                          search_key,
-                                          end_pos,
-                                          &iter->journal_idx);
-}
-
-static noinline
-void btree_trans_peek_prev_journal(struct btree_trans *trans,
-                                  struct btree_iter *iter,
-                                  struct bpos search_key,
-                                  struct bkey_s_c *k)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-       struct bkey_i *next_journal =
-               bch2_btree_journal_peek_prev(trans, iter, search_key,
-                               k->k ? k->k->p : path_l(path)->b->data->min_key);
-
-       if (next_journal) {
-               iter->k = next_journal->k;
-               *k = bkey_i_to_s_c(next_journal);
-       }
-}
-
-/*
- * Checks btree key cache for key at iter->pos and returns it if present, or
- * bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
-                                          struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey u;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       if ((iter->flags & BTREE_ITER_key_cache_fill) &&
-           bpos_eq(iter->pos, pos))
-               return bkey_s_c_null;
-
-       if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
-               return bkey_s_c_null;
-
-       if (!iter->key_cache_path)
-               iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
-                                                    iter->flags & BTREE_ITER_intent, 0,
-                                                    iter->flags|BTREE_ITER_cached|
-                                                    BTREE_ITER_cached_nofill,
-                                                    _THIS_IP_);
-
-       iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-       ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
-                                        iter->flags|BTREE_ITER_cached) ?:
-               bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
-       if (unlikely(ret))
-               return bkey_s_c_err(ret);
-
-       k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
-       if (!k.k)
-               return k;
-
-       if ((iter->flags & BTREE_ITER_all_snapshots) &&
-           !bpos_eq(pos, k.k->p))
-               return bkey_s_c_null;
-
-       iter->k = u;
-       k.k = &iter->k;
-       btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-       return k;
-}
-
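-/*
- * Core of the forwards iteration path: peek the btree at @search_key, let
- * the key cache, journal keys and pending transaction updates override the
- * key found in the btree, then skip whiteouts and advance across leaf nodes
- * until a live key (or the end of the btree) is found.
- */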
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
-                                             struct bpos search_key)
-{
-       struct bkey_s_c k, k2;
-       int ret;
-
-       EBUG_ON(btree_iter_path(trans, iter)->cached);
-       bch2_btree_iter_verify(trans, iter);
-
-       while (1) {
-               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-               if (unlikely(ret)) {
-                       /* ensure that iter->k is consistent with iter->pos: */
-                       bch2_btree_iter_set_pos(trans, iter, iter->pos);
-                       k = bkey_s_c_err(ret);
-                       break;
-               }
-
-               struct btree_path *path = btree_iter_path(trans, iter);
-               struct btree_path_level *l = path_l(path);
-
-               if (unlikely(!l->b)) {
-                       /* No btree nodes at requested level: */
-                       bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-                       k = bkey_s_c_null;
-                       break;
-               }
-
-               btree_path_set_should_be_locked(trans, path);
-
-               k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
-               if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-                   k.k &&
-                   (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
-                       k = k2;
-                       if (bkey_err(k)) {
-                               bch2_btree_iter_set_pos(trans, iter, iter->pos);
-                               break;
-                       }
-               }
-
-               if (unlikely(iter->flags & BTREE_ITER_with_journal))
-                       btree_trans_peek_journal(trans, iter, search_key, &k);
-
-               if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-                            trans->nr_updates))
-                       bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
-
-               if (k.k && bkey_deleted(k.k)) {
-                       /*
-                        * If we've got a whiteout, and it's after the search
-                        * key, advance the search key to the whiteout instead
-                        * of just after the whiteout - it might be a btree
-                        * whiteout, with a real key at the same position, since
-                        * in the btree, deleted keys sort before non-deleted.
-                        */
-                       search_key = !bpos_eq(search_key, k.k->p)
-                               ? k.k->p
-                               : bpos_successor(k.k->p);
-                       continue;
-               }
-
-               if (likely(k.k)) {
-                       break;
-               } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
-                       /* Advance to next leaf node: */
-                       search_key = bpos_successor(l->b->key.k.p);
-               } else {
-                       /* End of btree: */
-                       bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-                       k = bkey_s_c_null;
-                       break;
-               }
-       }
-
-       bch2_btree_iter_verify(trans, iter);
-
-       if (trace___btree_iter_peek_enabled()) {
-               CLASS(printbuf, buf)();
-
-               int ret = bkey_err(k);
-               if (ret)
-                       prt_str(&buf, bch2_err_str(ret));
-               else if (k.k)
-                       bch2_bkey_val_to_text(&buf, trans->c, k);
-               else
-                       prt_str(&buf, "(null)");
-               trace___btree_iter_peek(trans->c, buf.buf);
-       }
-
-       return k;
-}
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @trans:     btree transaction object
- * @iter:      iterator to peek from
- * @end:       search limit: returns keys less than or equal to @end
- *
- * Returns:    key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
-                                        struct bpos end)
-{
-       struct bpos search_key = btree_iter_search_key(iter);
-       struct bkey_s_c k;
-       struct bpos iter_pos = iter->pos;
-       int ret;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       bch2_btree_iter_verify_entry_exit(iter);
-       EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
-       ret = trans_maybe_inject_restart(trans, _RET_IP_);
-       if (unlikely(ret)) {
-               k = bkey_s_c_err(ret);
-               goto out_no_locked;
-       }
-
-       if (iter->update_path) {
-               bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
-               iter->update_path = 0;
-       }
-
-       while (1) {
-               k = __bch2_btree_iter_peek(trans, iter, search_key);
-               if (unlikely(!k.k))
-                       goto end;
-               if (unlikely(bkey_err(k)))
-                       goto out_no_locked;
-
-               if (iter->flags & BTREE_ITER_filter_snapshots) {
-                       /*
-                        * We need to check against @end before FILTER_SNAPSHOTS because
-                        * if we get to a different inode than requested we might be
-                        * seeing keys for a different snapshot tree that will all be
-                        * filtered out.
-                        *
-                        * But we can't do the full check here, because bkey_start_pos()
-                        * isn't monotonically increasing before FILTER_SNAPSHOTS, and
-                        * that's what we check against in extents mode:
-                        */
-                       if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-                                    ? bkey_gt(k.k->p, end)
-                                    : k.k->p.inode > end.inode))
-                               goto end;
-
-                       if (iter->update_path &&
-                           !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
-                               bch2_path_put(trans, iter->update_path,
-                                             iter->flags & BTREE_ITER_intent);
-                               iter->update_path = 0;
-                       }
-
-                       if ((iter->flags & BTREE_ITER_intent) &&
-                           !(iter->flags & BTREE_ITER_is_extents) &&
-                           !iter->update_path) {
-                               struct bpos pos = k.k->p;
-
-                               if (pos.snapshot < iter->snapshot) {
-                                       search_key = bpos_successor(k.k->p);
-                                       continue;
-                               }
-
-                               pos.snapshot = iter->snapshot;
-
-                               /*
-                                * advance, same as on exit for iter->path, but only up
-                                * to snapshot
-                                */
-                               __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
-                               iter->update_path = iter->path;
-
-                               iter->update_path = bch2_btree_path_set_pos(trans,
-                                                       iter->update_path, pos,
-                                                       iter->flags & BTREE_ITER_intent,
-                                                       _THIS_IP_);
-                               ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
-                               if (unlikely(ret)) {
-                                       k = bkey_s_c_err(ret);
-                                       goto out_no_locked;
-                               }
-                       }
-
-                       /*
-                        * We can never have a key in a leaf node at POS_MAX, so
-                        * we don't have to check these successor() calls:
-                        */
-                       if (!bch2_snapshot_is_ancestor(trans->c,
-                                                      iter->snapshot,
-                                                      k.k->p.snapshot)) {
-                               search_key = bpos_successor(k.k->p);
-                               continue;
-                       }
-
-                       if (bkey_whiteout(k.k) &&
-                           !(iter->flags & BTREE_ITER_key_cache_fill)) {
-                               search_key = bkey_successor(iter, k.k->p);
-                               continue;
-                       }
-               }
-
-               /*
-                * iter->pos should be monotonically increasing, and always be
-                * equal to the key we just returned - except extents can
-                * straddle iter->pos:
-                */
-               if (!(iter->flags & BTREE_ITER_is_extents))
-                       iter_pos = k.k->p;
-               else
-                       iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
-               if (unlikely(iter->flags & BTREE_ITER_all_snapshots     ? bpos_gt(iter_pos, end) :
-                            iter->flags & BTREE_ITER_is_extents        ? bkey_ge(iter_pos, end) :
-                                                                         bkey_gt(iter_pos, end)))
-                       goto end;
-
-               break;
-       }
-
-       iter->pos = iter_pos;
-
-       iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
-                               iter->flags & BTREE_ITER_intent,
-                               btree_iter_ip_allocated(iter));
-
-       btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
-       if (iter->update_path) {
-               ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
-               if (unlikely(ret))
-                       k = bkey_s_c_err(ret);
-               else
-                       btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
-       }
-
-       if (!(iter->flags & BTREE_ITER_all_snapshots))
-               iter->pos.snapshot = iter->snapshot;
-
-       ret = bch2_btree_iter_verify_ret(trans, iter, k);
-       if (unlikely(ret)) {
-               bch2_btree_iter_set_pos(trans, iter, iter->pos);
-               k = bkey_s_c_err(ret);
-       }
-
-       bch2_btree_iter_verify_entry_exit(iter);
-
-       if (trace_btree_iter_peek_max_enabled()) {
-               CLASS(printbuf, buf)();
-
-               int ret = bkey_err(k);
-               if (ret)
-                       prt_str(&buf, bch2_err_str(ret));
-               else if (k.k)
-                       bch2_bkey_val_to_text(&buf, trans->c, k);
-               else
-                       prt_str(&buf, "(null)");
-               trace_btree_iter_peek_max(trans->c, buf.buf);
-       }
-
-       return k;
-end:
-       bch2_btree_iter_set_pos(trans, iter, end);
-       k = bkey_s_c_null;
-       goto out_no_locked;
-}
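-
-/*
- * Minimal usage sketch (hypothetical caller; process() is a stand-in):
- * peek and advance in a loop, extracting errors - including transaction
- * restarts - with bkey_err():
- *
- *     bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, start, 0);
- *     while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
- *             if ((ret = bkey_err(k)))
- *                     break;
- *             process(k);
- *             bch2_btree_iter_advance(trans, &iter);
- *     }
- *     bch2_trans_iter_exit(trans, &iter);
- */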
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @trans:     btree transaction object
- * @iter:      iterator to peek from
- *
- * Returns:    key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (!bch2_btree_iter_advance(trans, iter))
-               return bkey_s_c_null;
-
-       return bch2_btree_iter_peek(trans, iter);
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
-                                                  struct bpos search_key)
-{
-       struct bkey_s_c k, k2;
-
-       bch2_btree_iter_verify(trans, iter);
-
-       while (1) {
-               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-               int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-               if (unlikely(ret)) {
-                       /* ensure that iter->k is consistent with iter->pos: */
-                       bch2_btree_iter_set_pos(trans, iter, iter->pos);
-                       k = bkey_s_c_err(ret);
-                       break;
-               }
-
-               struct btree_path *path = btree_iter_path(trans, iter);
-               struct btree_path_level *l = path_l(path);
-
-               if (unlikely(!l->b)) {
-                       /* No btree nodes at requested level: */
-                       bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-                       k = bkey_s_c_null;
-                       break;
-               }
-
-               btree_path_set_should_be_locked(trans, path);
-
-               k = btree_path_level_peek_all(trans->c, l, &iter->k);
-               if (!k.k || bpos_gt(k.k->p, search_key)) {
-                       k = btree_path_level_prev(trans, path, l, &iter->k);
-
-                       BUG_ON(k.k && bpos_gt(k.k->p, search_key));
-               }
-
-               if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-                   k.k &&
-                   (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
-                       k = k2;
-                       if (bkey_err(k2)) {
-                               bch2_btree_iter_set_pos(trans, iter, iter->pos);
-                               break;
-                       }
-               }
-
-               if (unlikely(iter->flags & BTREE_ITER_with_journal))
-                       btree_trans_peek_prev_journal(trans, iter, search_key, &k);
-
-               if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-                            trans->nr_updates))
-                       bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
-
-               if (likely(k.k && !bkey_deleted(k.k))) {
-                       break;
-               } else if (k.k) {
-                       search_key = bpos_predecessor(k.k->p);
-               } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
-                       /* Advance to previous leaf node: */
-                       search_key = bpos_predecessor(path->l[0].b->data->min_key);
-               } else {
-                       /* Start of btree: */
-                       bch2_btree_iter_set_pos(trans, iter, POS_MIN);
-                       k = bkey_s_c_null;
-                       break;
-               }
-       }
-
-       bch2_btree_iter_verify(trans, iter);
-       return k;
-}
-
-/**
- * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
- * iterator's current position
- * @trans:     btree transaction object
- * @iter:      iterator to peek from
- * @end:       search limit: returns keys greater than or equal to @end
- *
- * Returns:    key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
-                                             struct bpos end)
-{
-       if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
-          !bkey_eq(iter->pos, POS_MAX) &&
-          !((iter->flags & BTREE_ITER_is_extents) &&
-            iter->pos.offset == U64_MAX)) {
-
-               /*
-                * bkey_start_pos(), for extents, is not monotonically
-                * increasing until after filtering for snapshots:
-                *
-                * Thus, for extents we need to search forward until we find a
-                * real visible extent - easiest to just use peek_slot() (which
-                * internally uses peek() for extents)
-                */
-               struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-               if (bkey_err(k))
-                       return k;
-
-               if (!bkey_deleted(k.k) &&
-                   (!(iter->flags & BTREE_ITER_is_extents) ||
-                    bkey_lt(bkey_start_pos(k.k), iter->pos)))
-                       return k;
-       }
-
-       struct bpos search_key = iter->pos;
-       struct bkey_s_c k;
-       btree_path_idx_t saved_path = 0;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       bch2_btree_iter_verify_entry_exit(iter);
-       EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
-
-       int ret = trans_maybe_inject_restart(trans, _RET_IP_);
-       if (unlikely(ret)) {
-               k = bkey_s_c_err(ret);
-               goto out_no_locked;
-       }
-
-       while (1) {
-               k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
-               if (unlikely(!k.k))
-                       goto end;
-               if (unlikely(bkey_err(k)))
-                       goto out_no_locked;
-
-               if (iter->flags & BTREE_ITER_filter_snapshots) {
-                       struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
-                       if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
-                               /*
-                                * If we have a saved candidate, and we're past
-                                * the last possible snapshot overwrite, return
-                                * it:
-                                */
-                               bch2_path_put(trans, iter->path,
-                                             iter->flags & BTREE_ITER_intent);
-                               iter->path = saved_path;
-                               saved_path = 0;
-                               k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-                               break;
-                       }
-
-                       /*
-                        * We need to check against @end before FILTER_SNAPSHOTS because
-                        * if we get to a different inode than requested we might be
-                        * seeing keys for a different snapshot tree that will all be
-                        * filtered out.
-                        */
-                       if (unlikely(bkey_lt(k.k->p, end)))
-                               goto end;
-
-                       if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
-                               search_key = bpos_predecessor(k.k->p);
-                               continue;
-                       }
-
-                       if (k.k->p.snapshot != iter->snapshot) {
-                               /*
-                                * Have a key visible in iter->snapshot, but
-                                * might have overwrites: - save it and keep
-                                * searching. Unless it's a whiteout - then drop
-                                * our previous saved candidate:
-                                */
-                               if (saved_path) {
-                                       bch2_path_put(trans, saved_path,
-                                                     iter->flags & BTREE_ITER_intent);
-                                       saved_path = 0;
-                               }
-
-                               if (!bkey_whiteout(k.k)) {
-                                       saved_path = btree_path_clone(trans, iter->path,
-                                                               iter->flags & BTREE_ITER_intent,
-                                                               _THIS_IP_);
-                                       trace_btree_path_save_pos(trans,
-                                                                 trans->paths + iter->path,
-                                                                 trans->paths + saved_path);
-                               }
-
-                               search_key = bpos_predecessor(k.k->p);
-                               continue;
-                       }
-
-                       if (bkey_whiteout(k.k)) {
-                               search_key = bkey_predecessor(iter, k.k->p);
-                               search_key.snapshot = U32_MAX;
-                               continue;
-                       }
-               }
-
-               EBUG_ON(iter->flags & BTREE_ITER_all_snapshots          ? bpos_gt(k.k->p, iter->pos) :
-                       iter->flags & BTREE_ITER_is_extents             ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
-                                                                         bkey_gt(k.k->p, iter->pos));
-
-               if (unlikely(iter->flags & BTREE_ITER_all_snapshots     ? bpos_lt(k.k->p, end) :
-                            iter->flags & BTREE_ITER_is_extents        ? bkey_le(k.k->p, end) :
-                                                                         bkey_lt(k.k->p, end)))
-                       goto end;
-
-               break;
-       }
-
-       /* Extents can straddle iter->pos: */
-       iter->pos = bpos_min(iter->pos, k.k->p);
-
-       if (iter->flags & BTREE_ITER_filter_snapshots)
-               iter->pos.snapshot = iter->snapshot;
-out_no_locked:
-       if (saved_path)
-               bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
-       bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(trans, iter);
-
-       if (trace_btree_iter_peek_prev_min_enabled()) {
-               CLASS(printbuf, buf)();
-
-               int ret = bkey_err(k);
-               if (ret)
-                       prt_str(&buf, bch2_err_str(ret));
-               else if (k.k)
-                       bch2_bkey_val_to_text(&buf, trans->c, k);
-               else
-                       prt_str(&buf, "(null)");
-               trace_btree_iter_peek_prev_min(trans->c, buf.buf);
-       }
-       return k;
-end:
-       bch2_btree_iter_set_pos(trans, iter, end);
-       k = bkey_s_c_null;
-       goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @trans:     btree transaction object
- * @iter:      iterator to peek from
- *
- * Returns:    key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (!bch2_btree_iter_rewind(trans, iter))
-               return bkey_s_c_null;
-
-       return bch2_btree_iter_peek_prev(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct bpos search_key;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       bch2_btree_iter_verify(trans, iter);
-       bch2_btree_iter_verify_entry_exit(iter);
-       EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
-       ret = trans_maybe_inject_restart(trans, _RET_IP_);
-       if (unlikely(ret)) {
-               k = bkey_s_c_err(ret);
-               goto out;
-       }
-
-       /* extents can't span inode numbers: */
-       if ((iter->flags & BTREE_ITER_is_extents) &&
-           unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
-               if (iter->pos.inode == KEY_INODE_MAX) {
-                       k = bkey_s_c_null;
-                       goto out2;
-               }
-
-               bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
-       }
-
-       search_key = btree_iter_search_key(iter);
-       iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-                                       iter->flags & BTREE_ITER_intent,
-                                       btree_iter_ip_allocated(iter));
-
-       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-       if (unlikely(ret)) {
-               k = bkey_s_c_err(ret);
-               goto out;
-       }
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       if (unlikely(!btree_path_node(path, path->level))) {
-               k = bkey_s_c_null;
-               goto out2;
-       }
-
-       btree_path_set_should_be_locked(trans, path);
-
-       if ((iter->flags & BTREE_ITER_cached) ||
-           !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
-               k = bkey_s_c_null;
-
-               if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-                            trans->nr_updates)) {
-                       bch2_btree_trans_peek_slot_updates(trans, iter, &k);
-                       if (k.k)
-                               goto out;
-               }
-
-               if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
-                   (k = btree_trans_peek_slot_journal(trans, iter)).k)
-                       goto out;
-
-               if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-                   (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
-                       if (!bkey_err(k))
-                               iter->k = *k.k;
-                       /* We're not returning a key from iter->path: */
-                       goto out;
-               }
-
-               k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-               if (unlikely(!k.k))
-                       goto out;
-
-               if (unlikely(k.k->type == KEY_TYPE_whiteout &&
-                            (iter->flags & BTREE_ITER_filter_snapshots) &&
-                            !(iter->flags & BTREE_ITER_key_cache_fill)))
-                       iter->k.type = KEY_TYPE_deleted;
-       } else {
-               struct bpos next;
-               struct bpos end = iter->pos;
-
-               if (iter->flags & BTREE_ITER_is_extents)
-                       end.offset = U64_MAX;
-
-               EBUG_ON(btree_iter_path(trans, iter)->level);
-
-               if (iter->flags & BTREE_ITER_intent) {
-                       struct btree_iter iter2;
-
-                       bch2_trans_copy_iter(trans, &iter2, iter);
-                       k = bch2_btree_iter_peek_max(trans, &iter2, end);
-
-                       if (k.k && !bkey_err(k)) {
-                               swap(iter->key_cache_path, iter2.key_cache_path);
-                               iter->k = iter2.k;
-                               k.k = &iter->k;
-                       }
-                       bch2_trans_iter_exit(trans, &iter2);
-               } else {
-                       struct bpos pos = iter->pos;
-
-                       k = bch2_btree_iter_peek_max(trans, iter, end);
-                       if (unlikely(bkey_err(k)))
-                               bch2_btree_iter_set_pos(trans, iter, pos);
-                       else
-                               iter->pos = pos;
-               }
-
-               if (unlikely(bkey_err(k)))
-                       goto out;
-
-               next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
-               if (bkey_lt(iter->pos, next)) {
-                       bkey_init(&iter->k);
-                       iter->k.p = iter->pos;
-
-                       if (iter->flags & BTREE_ITER_is_extents) {
-                               bch2_key_resize(&iter->k,
-                                               min_t(u64, KEY_SIZE_MAX,
-                                                     (next.inode == iter->pos.inode
-                                                      ? next.offset
-                                                      : KEY_OFFSET_MAX) -
-                                                     iter->pos.offset));
-                               EBUG_ON(!iter->k.size);
-                       }
-
-                       k = (struct bkey_s_c) { &iter->k, NULL };
-               }
-       }
-out:
-       bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(trans, iter);
-       ret = bch2_btree_iter_verify_ret(trans, iter, k);
-       if (unlikely(ret))
-               k = bkey_s_c_err(ret);
-out2:
-       if (trace_btree_iter_peek_slot_enabled()) {
-               CLASS(printbuf, buf)();
-
-               int ret = bkey_err(k);
-               if (ret)
-                       prt_str(&buf, bch2_err_str(ret));
-               else if (k.k)
-                       bch2_bkey_val_to_text(&buf, trans->c, k);
-               else
-                       prt_str(&buf, "(null)");
-               trace_btree_iter_peek_slot(trans->c, buf.buf);
-       }
-
-       return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (!bch2_btree_iter_advance(trans, iter))
-               return bkey_s_c_null;
-
-       return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (!bch2_btree_iter_rewind(trans, iter))
-               return bkey_s_c_null;
-
-       return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-/* Obsolete, but still used by the Rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct bkey_s_c k;
-
-       while (btree_trans_too_many_iters(trans) ||
-              (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
-               bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-               bch2_trans_begin(trans);
-
-       return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
-       trans_for_each_path(trans, path, i) {
-               BUG_ON(path->sorted_idx >= trans->nr_sorted);
-               BUG_ON(trans->sorted[path->sorted_idx] != i);
-       }
-
-       for (i = 0; i < trans->nr_sorted; i++) {
-               unsigned idx = trans->sorted[i];
-
-               BUG_ON(!test_bit(idx, trans->paths_allocated));
-               BUG_ON(trans->paths[idx].sorted_idx != i);
-       }
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
-       struct btree_path *path, *prev = NULL;
-       struct trans_for_each_path_inorder_iter iter;
-
-       if (!static_branch_unlikely(&bch2_debug_check_iterators))
-               return;
-
-       trans_for_each_path_inorder(trans, path, iter) {
-               if (prev && btree_path_cmp(prev, path) > 0) {
-                       __bch2_dump_trans_paths_updates(trans, true);
-                       panic("trans paths out of order!\n");
-               }
-               prev = path;
-       }
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
-       int i, l = 0, r = trans->nr_sorted, inc = 1;
-       bool swapped;
-
-       btree_trans_verify_sorted_refs(trans);
-
-       if (trans->paths_sorted)
-               goto out;
-
-       /*
-        * Cocktail shaker sort: this is efficient because iterators will be
-        * mostly sorted.
-        */
-       do {
-               swapped = false;
-
-               for (i = inc > 0 ? l : r - 2;
-                    i + 1 < r && i >= l;
-                    i += inc) {
-                       if (btree_path_cmp(trans->paths + trans->sorted[i],
-                                          trans->paths + trans->sorted[i + 1]) > 0) {
-                               swap(trans->sorted[i], trans->sorted[i + 1]);
-                               trans->paths[trans->sorted[i]].sorted_idx = i;
-                               trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
-                               swapped = true;
-                       }
-               }
-
-               if (inc > 0)
-                       --r;
-               else
-                       l++;
-               inc = -inc;
-       } while (swapped);
-
-       trans->paths_sorted = true;
-out:
-       btree_trans_verify_sorted(trans);
-}
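-
-/*
- * Illustrative sketch of the cocktail shaker sort above, on a plain int
- * array (example only, not part of the original file; uses the kernel's
- * swap() helper). The window [l, r) shrinks from whichever end the last
- * pass finished on, and sorting stops as soon as a pass makes no swaps,
- * which is what makes it cheap on mostly-sorted input:
- */
-static void __maybe_unused cocktail_shaker_sort_example(int *a, int n)
-{
-       int i, l = 0, r = n, inc = 1;
-       bool swapped;
-
-       do {
-               swapped = false;
-
-               for (i = inc > 0 ? l : r - 2;
-                    i + 1 < r && i >= l;
-                    i += inc)
-                       if (a[i] > a[i + 1]) {
-                               swap(a[i], a[i + 1]);
-                               swapped = true;
-                       }
-
-               if (inc > 0)
-                       --r;
-               else
-                       l++;
-               inc = -inc;
-       } while (swapped);
-}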
-
-static inline void btree_path_list_remove(struct btree_trans *trans,
-                                         struct btree_path *path)
-{
-       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
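-       /*
-        * trans->sorted[] holds narrow path indices; when unaligned access
-        * is cheap they are shifted in u64-sized chunks, with DIV_ROUND_UP
-        * converting the remaining entry count into a count of u64 words.
-        */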
-       trans->nr_sorted--;
-       memmove_u64s_down_small(trans->sorted + path->sorted_idx,
-                               trans->sorted + path->sorted_idx + 1,
-                               DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-                                            sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
-       array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
-       for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-               trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
-                                      btree_path_idx_t pos,
-                                      btree_path_idx_t path_idx)
-{
-       struct btree_path *path = trans->paths + path_idx;
-
-       path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-       memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
-                             trans->sorted + path->sorted_idx,
-                             DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
-                                          sizeof(u64) / sizeof(btree_path_idx_t)));
-       trans->nr_sorted++;
-       trans->sorted[path->sorted_idx] = path_idx;
-#else
-       array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
-       for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
-               trans->paths[trans->sorted[i]].sorted_idx = i;
-
-       btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
-       if (iter->update_path)
-               bch2_path_put(trans, iter->update_path,
-                             iter->flags & BTREE_ITER_intent);
-       if (iter->path)
-               bch2_path_put(trans, iter->path,
-                             iter->flags & BTREE_ITER_intent);
-       if (iter->key_cache_path)
-               bch2_path_put(trans, iter->key_cache_path,
-                             iter->flags & BTREE_ITER_intent);
-       iter->path              = 0;
-       iter->update_path       = 0;
-       iter->key_cache_path    = 0;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         enum btree_id btree_id, struct bpos pos,
-                         unsigned flags)
-{
-       bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-                              bch2_btree_iter_flags(trans, btree_id, 0, flags),
-                              _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              enum btree_id btree_id,
-                              struct bpos pos,
-                              unsigned locks_want,
-                              unsigned depth,
-                              unsigned flags)
-{
-       flags |= BTREE_ITER_not_extents;
-       flags |= BTREE_ITER_snapshot_field;
-       flags |= BTREE_ITER_all_snapshots;
-
-       if (!depth && btree_id_cached(trans->c, btree_id))
-               flags |= BTREE_ITER_with_key_cache;
-
-       bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
-                              bch2_btree_iter_flags(trans, btree_id, depth, flags),
-                              _RET_IP_);
-
-       iter->min_depth = depth;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       BUG_ON(path->locks_want  < min(locks_want, BTREE_MAX_DEPTH));
-       BUG_ON(path->level      != depth);
-       BUG_ON(iter->min_depth  != depth);
-}
-
-void bch2_trans_copy_iter(struct btree_trans *trans,
-                         struct btree_iter *dst, struct btree_iter *src)
-{
-       *dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
-       dst->ip_allocated = _RET_IP_;
-#endif
-       if (src->path)
-               __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
-       if (src->update_path)
-               __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
-       dst->key_cache_path = 0;
-}
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
-                                     darray_trans_kmalloc_trace *trace)
-{
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 60);
-
-       darray_for_each(*trace, i)
-               prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
-}
-#endif
-
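-/*
- * Slow path of the transaction's bump allocator, called when the current
- * buffer can't satisfy a request: the buffer grows to the next power of two,
- * capped at BTREE_TRANS_MEM_MAX (with a mempool as a last resort). If memory
- * has already been handed out from the old buffer, the transaction is
- * restarted so bch2_trans_begin() can safely krealloc() it.
- */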
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
-{
-       struct bch_fs *c = trans->c;
-       unsigned new_top = trans->mem_top + size;
-       unsigned old_bytes = trans->mem_bytes;
-       unsigned new_bytes = roundup_pow_of_two(new_top);
-       int ret;
-       void *new_mem;
-       void *p;
-
-       if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
-                          BTREE_TRANS_MEM_MAX);
-
-               bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-#endif
-       }
-
-       ret = trans_maybe_inject_restart(trans, _RET_IP_);
-       if (ret)
-               return ERR_PTR(ret);
-
-       struct btree_transaction_stats *s = btree_trans_stats(trans);
-       if (new_bytes > s->max_mem) {
-               mutex_lock(&s->lock);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-               darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
-               s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
-                                               trans->trans_kmalloc_trace.nr);
-
-               memcpy(s->trans_kmalloc_trace.data,
-                      trans->trans_kmalloc_trace.data,
-                      sizeof(s->trans_kmalloc_trace.data[0]) *
-                      s->trans_kmalloc_trace.nr);
-#endif
-               s->max_mem = new_bytes;
-               mutex_unlock(&s->lock);
-       }
-
-       if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
-               EBUG_ON(trans->mem_bytes >= new_bytes);
-               return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-       }
-
-       if (old_bytes) {
-               trans->realloc_bytes_required = new_bytes;
-               trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
-               return ERR_PTR(btree_trans_restart_ip(trans,
-                                       BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
-       }
-
-       EBUG_ON(trans->mem);
-
-       new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-       if (unlikely(!new_mem)) {
-               bch2_trans_unlock(trans);
-
-               new_mem = kmalloc(new_bytes, GFP_KERNEL);
-               if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-                       new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-                       new_bytes = BTREE_TRANS_MEM_MAX;
-                       trans->used_mempool = true;
-               }
-
-               EBUG_ON(!new_mem);
-
-               trans->mem = new_mem;
-               trans->mem_bytes = new_bytes;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-
-       trans->mem = new_mem;
-       trans->mem_bytes = new_bytes;
-
-       p = trans->mem + trans->mem_top;
-       trans->mem_top += size;
-       memset(p, 0, size);
-       return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
-       WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-            "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
-            (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
-       if (trans->srcu_held) {
-               struct bch_fs *c = trans->c;
-               struct btree_path *path;
-               unsigned i;
-
-               trans_for_each_path(trans, path, i)
-                       if (path->cached && !btree_node_locked(path, 0))
-                               path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
-               check_srcu_held_too_long(trans);
-               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-               trans->srcu_held = false;
-       }
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
-       if (!trans->srcu_held) {
-               trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
-               trans->srcu_lock_time   = jiffies;
-               trans->srcu_held = true;
-       }
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns:    current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over or updating nodes, an attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs, bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-       u64 now;
-
-       bch2_trans_reset_updates(trans);
-
-       trans->restart_count++;
-       trans->mem_top                  = 0;
-
-       if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
-               EBUG_ON(!trans->mem || !trans->mem_bytes);
-               unsigned new_bytes = trans->realloc_bytes_required;
-               void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-               if (unlikely(!new_mem)) {
-                       bch2_trans_unlock(trans);
-                       new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-
-                       EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
-                       if (!new_mem) {
-                               new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-                               new_bytes = BTREE_TRANS_MEM_MAX;
-                               trans->used_mempool = true;
-                               kfree(trans->mem);
-                       }
-               }
-               trans->mem = new_mem;
-               trans->mem_bytes = new_bytes;
-       }
-
-       trans_for_each_path(trans, path, i) {
-               path->should_be_locked = false;
-
-               /*
-                * If the transaction wasn't restarted, we're presuming to be
-                * doing something new: don't keep iterators except the ones
-                * that are in use - and those on the subvolumes btree:
-                */
-               if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
-                       path->preserve = false;
-
-               /*
-                * XXX: we probably shouldn't be doing this if the transaction
-                * was restarted, but currently we still overflow transaction
-                * iterators if we do that
-                */
-               if (!path->ref && !path->preserve)
-                       __bch2_path_free(trans, i);
-               else
-                       path->preserve = false;
-       }
-
-       now = local_clock();
-
-       if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
-           time_after64(now, trans->last_begin_time + 10))
-               __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
-                                        trans->last_begin_time, now);
-
-       if (!trans->restarted &&
-           (need_resched() ||
-            time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
-               bch2_trans_unlock(trans);
-               cond_resched();
-               now = local_clock();
-       }
-       trans->last_begin_time = now;
-
-       if (unlikely(trans->srcu_held &&
-                    time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-               bch2_trans_srcu_unlock(trans);
-
-       trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-       if (trans->restarted)
-               trans->restart_count_this_trans++;
-       else
-               trans->restart_count_this_trans = 0;
-#endif
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-       trans->trans_kmalloc_trace.nr = 0;
-#endif
-
-       trans_set_locked(trans, false);
-
-       if (trans->restarted) {
-               bch2_btree_path_traverse_all(trans);
-               trans->notrace_relock_fail = false;
-       }
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       return trans->restart_count;
-}
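-
-/*
- * Illustrative sketch (example only, not from the original file) of the
- * usual retry loop around bch2_trans_begin(): on transaction restart the
- * operation is simply redone from the top with fresh iterators.
- */
-static int __maybe_unused example_lookup(struct bch_fs *c, enum btree_id btree,
-                                        struct bpos pos)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       do {
-               bch2_trans_begin(trans);
-
-               bch2_trans_iter_init(trans, &iter, btree, pos, 0);
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               ret = bkey_err(k);
-               bch2_trans_iter_exit(trans, &iter);
-       } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-       bch2_trans_put(trans);
-       return ret;
-}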
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
-       for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
-               if (!bch2_btree_transaction_fns[i] ||
-                   bch2_btree_transaction_fns[i] == fn) {
-                       bch2_btree_transaction_fns[i] = fn;
-                       return i;
-               }
-
-       pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
-       return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
-       __acquires(&c->btree_trans_barrier)
-{
-       struct btree_trans *trans;
-
-       if (IS_ENABLED(__KERNEL__)) {
-               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-               if (trans) {
-                       memset(trans, 0, offsetof(struct btree_trans, list));
-                       goto got_trans;
-               }
-       }
-
-       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-       memset(trans, 0, sizeof(*trans));
-
-       seqmutex_lock(&c->btree_trans_lock);
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-               struct btree_trans *pos;
-               pid_t pid = current->pid;
-
-               trans->locking_wait.task = current;
-
-               list_for_each_entry(pos, &c->btree_trans_list, list) {
-                       struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
-                       /*
-                        * We'd much prefer to be stricter here and completely
-                        * disallow multiple btree_trans in the same thread -
-                        * but the data move path calls bch2_write when we
-                        * already have a btree_trans initialized.
-                        */
-                       BUG_ON(pos_task &&
-                              pid == pos_task->pid &&
-                              pos->locked);
-               }
-       }
-
-       list_add(&trans->list, &c->btree_trans_list);
-       seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
-       trans->c                = c;
-       trans->last_begin_time  = local_clock();
-       trans->fn_idx           = fn_idx;
-       trans->locking_wait.task = current;
-       trans->journal_replay_not_finished =
-               unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
-               atomic_inc_not_zero(&c->journal_keys.ref);
-       trans->nr_paths         = ARRAY_SIZE(trans->_paths);
-       trans->paths_allocated  = trans->_paths_allocated;
-       trans->sorted           = trans->_sorted;
-       trans->paths            = trans->_paths;
-       trans->updates          = trans->_updates;
-
-       *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
-       trans->paths_allocated[0] = 1;
-
-       static struct lock_class_key lockdep_key;
-       lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
-
-       if (fn_idx < BCH_TRANSACTIONS_NR) {
-               trans->fn = bch2_btree_transaction_fns[fn_idx];
-
-               struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
-               if (s->max_mem) {
-                       unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
-                       trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-                       if (likely(trans->mem))
-                               trans->mem_bytes = expected_mem_bytes;
-               }
-
-               trans->nr_paths_max = s->nr_max_paths;
-       }
-
-       trans->srcu_idx         = srcu_read_lock(&c->btree_trans_barrier);
-       trans->srcu_lock_time   = jiffies;
-       trans->srcu_held        = true;
-       trans_set_locked(trans, false);
-
-       closure_init_stack_release(&trans->ref);
-       return trans;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static bool btree_paths_leaked(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               if (path->ref)
-                       return true;
-       return false;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
-       if (btree_paths_leaked(trans)) {
-               struct bch_fs *c = trans->c;
-               struct btree_path *path;
-               unsigned i;
-
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
-               trans_for_each_path(trans, path, i)
-                       if (path->ref)
-                               prt_printf(&buf, "btree %s %pS\n",
-                                          bch2_btree_id_str(path->btree_id),
-                                          (void *) path->ip_allocated);
-
-               bch2_fs_emergency_read_only2(c, &buf);
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-}
-#else
-static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
-#endif
-
-void bch2_trans_put(struct btree_trans *trans)
-       __releases(&c->btree_trans_barrier)
-{
-       struct bch_fs *c = trans->c;
-
-       if (trans->restarted)
-               bch2_trans_in_restart_error(trans);
-
-       bch2_trans_unlock(trans);
-
-       trans_for_each_update(trans, i)
-               __btree_path_put(trans, trans->paths + i->path, true);
-       trans->nr_updates       = 0;
-
-       check_btree_paths_leaked(trans);
-
-       if (trans->srcu_held) {
-               check_srcu_held_too_long(trans);
-               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-       }
-
-       if (unlikely(trans->journal_replay_not_finished))
-               bch2_journal_keys_put(c);
-
-       /*
-        * trans->ref protects trans->locking_wait.task, btree_paths array; used
-        * by cycle detector
-        */
-       closure_return_sync(&trans->ref);
-       trans->locking_wait.task = NULL;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       darray_exit(&trans->last_restarted_trace);
-#endif
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-       darray_exit(&trans->trans_kmalloc_trace);
-#endif
-
-       unsigned long *paths_allocated = trans->paths_allocated;
-       trans->paths_allocated  = NULL;
-       trans->paths            = NULL;
-
-       if (paths_allocated != trans->_paths_allocated)
-               kvfree_rcu_mightsleep(paths_allocated);
-
-       if (trans->used_mempool)
-               mempool_free(trans->mem, &c->btree_trans_mem_pool);
-       else
-               kfree(trans->mem);
-
-       /* Userspace doesn't have a real percpu implementation: */
-       if (IS_ENABLED(__KERNEL__))
-               trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
-       if (trans) {
-               seqmutex_lock(&c->btree_trans_lock);
-               list_del(&trans->list);
-               seqmutex_unlock(&c->btree_trans_lock);
-
-               mempool_free(trans, &c->btree_trans_pool);
-       }
-}
-
-bool bch2_current_has_btree_trans(struct bch_fs *c)
-{
-       seqmutex_lock(&c->btree_trans_lock);
-       struct btree_trans *trans;
-       bool ret = false;
-       list_for_each_entry(trans, &c->btree_trans_list, list)
-               if (trans->locking_wait.task == current &&
-                   trans->locked) {
-                       ret = true;
-                       break;
-               }
-       seqmutex_unlock(&c->btree_trans_lock);
-       return ret;
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
-                                     struct btree_bkey_cached_common *b)
-{
-       struct six_lock_count c = six_lock_counts(&b->lock);
-       pid_t pid;
-
-       scoped_guard(rcu) {
-               struct task_struct *owner = READ_ONCE(b->lock.owner);
-               pid = owner ? owner->pid : 0;
-       }
-
-       prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
-       bch2_btree_id_to_text(out, b->btree_id);
-       prt_printf(out, " l=%u:", b->level);
-       bch2_bpos_to_text(out, btree_node_pos(b));
-
-       prt_printf(out, "\t locks %u:%u:%u held by pid %u",
-                  c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
-       struct btree_bkey_cached_common *b;
-       static char lock_types[] = { 'r', 'i', 'w' };
-       struct task_struct *task = READ_ONCE(trans->locking_wait.task);
-       unsigned l, idx;
-
-       /* before rcu_read_lock(): */
-       bch2_printbuf_make_room(out, 4096);
-
-       if (!out->nr_tabstops) {
-               printbuf_tabstop_push(out, 16);
-               printbuf_tabstop_push(out, 32);
-       }
-
-       prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
-       /* trans->paths is rcu protected vs. freeing */
-       guard(rcu)();
-       out->atomic++;
-
-       struct btree_path *paths = rcu_dereference(trans->paths);
-       if (!paths)
-               goto out;
-
-       unsigned long *paths_allocated = trans_paths_allocated(paths);
-
-       trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
-               struct btree_path *path = paths + idx;
-               if (!path->nodes_locked)
-                       continue;
-
-               prt_printf(out, "  path %u %c ",
-                          idx,
-                          path->cached ? 'c' : 'b');
-               bch2_btree_id_to_text(out, path->btree_id);
-               prt_printf(out, " l=%u:", path->level);
-               bch2_bpos_to_text(out, path->pos);
-               prt_newline(out);
-
-               for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-                       if (btree_node_locked(path, l) &&
-                           !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
-                               prt_printf(out, "    %c l=%u ",
-                                          lock_types[btree_node_locked_type(path, l)], l);
-                               bch2_btree_bkey_cached_common_to_text(out, b);
-                               prt_newline(out);
-                       }
-               }
-       }
-
-       b = READ_ONCE(trans->locking);
-       if (b) {
-               prt_printf(out, "  blocked for %lluus on\n",
-                          div_u64(local_clock() - trans->locking_wait.start_time, 1000));
-               prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
-               bch2_btree_bkey_cached_common_to_text(out, b);
-               prt_newline(out);
-       }
-out:
-       --out->atomic;
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
-       struct btree_transaction_stats *s;
-       struct btree_trans *trans;
-       int cpu;
-
-       if (c->btree_trans_bufs)
-               for_each_possible_cpu(cpu) {
-                       struct btree_trans *trans =
-                               per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
-                       if (trans) {
-                               seqmutex_lock(&c->btree_trans_lock);
-                               list_del(&trans->list);
-                               seqmutex_unlock(&c->btree_trans_lock);
-                       }
-                       kfree(trans);
-               }
-       free_percpu(c->btree_trans_bufs);
-
-       trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
-       if (trans)
-               panic("%s leaked btree_trans\n", trans->fn);
-
-       for (s = c->btree_transaction_stats;
-            s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-            s++) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-               darray_exit(&s->trans_kmalloc_trace);
-#endif
-               kfree(s->max_paths_text);
-               bch2_time_stats_exit(&s->lock_hold_times);
-       }
-
-       if (c->btree_trans_barrier_initialized) {
-               synchronize_srcu_expedited(&c->btree_trans_barrier);
-               cleanup_srcu_struct(&c->btree_trans_barrier);
-       }
-       mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
-       struct btree_transaction_stats *s;
-
-       for (s = c->btree_transaction_stats;
-            s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
-            s++) {
-               bch2_time_stats_init(&s->duration);
-               bch2_time_stats_init(&s->lock_hold_times);
-               mutex_init(&s->lock);
-       }
-
-       INIT_LIST_HEAD(&c->btree_trans_list);
-       seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
-       int ret;
-
-       c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
-       if (!c->btree_trans_bufs)
-               return -ENOMEM;
-
-       ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
-                                         sizeof(struct btree_trans)) ?:
-               mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
-                                         BTREE_TRANS_MEM_MAX) ?:
-               init_srcu_struct(&c->btree_trans_barrier);
-       if (ret)
-               return ret;
-
-       /*
-        * static annotation (hackily done) for lock ordering of reclaim vs.
-        * btree node locks:
-        */
-#ifdef CONFIG_LOCKDEP
-       fs_reclaim_acquire(GFP_KERNEL);
-       struct btree_trans *trans = bch2_trans_get(c);
-       trans_set_locked(trans, false);
-       bch2_trans_put(trans);
-       fs_reclaim_release(GFP_KERNEL);
-#endif
-
-       c->btree_trans_barrier_initialized = true;
-       return 0;
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644 (file)
index 09dd3e5..0000000
+++ /dev/null
@@ -1,1010 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-static inline int __bkey_err(const struct bkey *k)
-{
-       return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k)   __bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
-       unsigned idx = path - trans->paths;
-
-       EBUG_ON(idx >= trans->nr_paths);
-       EBUG_ON(!test_bit(idx, trans->paths_allocated));
-       if (unlikely(path->ref == U8_MAX)) {
-               bch2_dump_trans_paths_updates(trans);
-               panic("path %u refcount overflow\n", idx);
-       }
-
-       path->ref++;
-       path->intent_ref += intent;
-       trace_btree_path_get_ll(trans, path);
-}
-
-static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
-       EBUG_ON(path - trans->paths >= trans->nr_paths);
-       EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
-       EBUG_ON(!path->ref);
-       EBUG_ON(!path->intent_ref && intent);
-
-       trace_btree_path_put_ll(trans, path);
-       path->intent_ref -= intent;
-       return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_trans *trans,
-                                       struct btree_path *path,
-                                       enum btree_path_uptodate u)
-{
-       BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
-       path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
-                                           unsigned level)
-{
-       return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
-                                       const struct btree *b, unsigned level)
-{
-       return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
-                                             struct btree *b)
-{
-       return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
-       if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           trans->paths_sorted)
-               return;
-       __bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
-       return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
-       unsigned long *v = trans_paths_nr(paths);
-       return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
-       for (_idx = _start;                                             \
-            (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
-            _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
-       unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
-       /*
-        * Open coded find_next_bit(), because
-        *  - this is the fast path, we can't afford the function call
-        *  - and we know that nr_paths is a multiple of BITS_PER_LONG
-        */
-       while (*idx < trans->nr_paths) {
-               unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
-               if (v) {
-                       *idx += __ffs(v);
-                       return trans->paths + *idx;
-               }
-
-               *idx += BITS_PER_LONG;
-               *idx &= ~(BITS_PER_LONG - 1);
-               w++;
-       }
-
-       return NULL;
-}
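-
-/*
- * Worked example of the scan above: with BITS_PER_LONG == 64 and *idx == 70,
- * w points at the second bitmap word and v is that word shifted right by 6.
- * If bits 70 and 73 of the bitmap are set, v has bits 0 and 3 set, so
- * __ffs(v) == 0 and *idx stays 70; if v == 0, *idx advances to 128, the
- * next word boundary, and the scan continues.
- */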
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned
- * by another thread, e.g. for bch2_btree_trans_to_text().
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start)          \
-       for (_idx = _start;                                             \
-            (_path = __trans_next_path((_trans), &_idx));              \
-            _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx)                       \
-       trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-       unsigned idx = path ? path->sorted_idx + 1 : 0;
-
-       EBUG_ON(idx > trans->nr_sorted);
-
-       return idx < trans->nr_sorted
-               ? trans->paths + trans->sorted[idx]
-               : NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
-       unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
-       return idx
-               ? trans->paths + trans->sorted[idx - 1]
-               : NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter)                 \
-       for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };   \
-            (_iter.path_idx = (_trans)->sorted[_iter.sorted_idx],      \
-             _iter.sorted_idx < (_trans)->nr_sorted);                  \
-            _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
-       btree_path_idx_t        sorted_idx;
-       btree_path_idx_t        path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter)              \
-       for (_iter = (struct trans_for_each_path_inorder_iter) { 0 };   \
-            (_iter.path_idx = (_trans)->sorted[_iter.sorted_idx],      \
-             _path = (_trans)->paths + _iter.path_idx,                 \
-             _iter.sorted_idx < (_trans)->nr_sorted);                  \
-            _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i)         \
-       for (_i = (_trans)->nr_sorted - 1;                              \
-            ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) >= 0;\
-            --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
-                                  const struct btree *b)
-{
-       return path->l[b->c.level].b == b &&
-               btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
-                           unsigned *idx)
-{
-       struct btree_path *path;
-
-       while ((path = __trans_next_path(trans, idx)) &&
-               !__path_has_node(path, b))
-              (*idx)++;
-
-       return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter)                \
-       for (_iter = 1;                                                 \
-            (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
-            _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
-                                           bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
-                        btree_path_idx_t path, bool intent,
-                        unsigned long ip)
-{
-       if (trans->paths[path].ref > 1 ||
-           trans->paths[path].preserve)
-               path = __bch2_btree_path_make_mut(trans, path, intent, ip);
-       trans->paths[path].should_be_locked = false;
-       return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
-                         struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
-                       btree_path_idx_t path, struct bpos new_pos,
-                       bool intent, unsigned long ip)
-{
-       return !bpos_eq(new_pos, trans->paths[path].pos)
-               ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
-               : path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
-                                             btree_path_idx_t,
-                                             unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
-                                         btree_path_idx_t path, unsigned flags)
-{
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
-               return 0;
-
-       return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
-                                unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
-                                           unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
-       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
-       if (k.k && bpos_eq(path->pos, k.k->p))
-               return k;
-
-       bkey_init(u);
-       u->p = path->pos;
-       return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
-                                       struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
-       return mutex_trylock(lock)
-               ? 0
-               : __bch2_trans_mutex_lock(trans, lock);
-}
-
-/* Debug: */
-
-void __bch2_trans_verify_paths(struct btree_trans *);
-void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline void bch2_trans_verify_paths(struct btree_trans *trans)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_trans_verify_paths(trans);
-}
-
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
-                                         struct bpos pos)
-{
-       if (static_branch_unlikely(&bch2_debug_check_iterators))
-               __bch2_assert_pos_locked(trans, btree, pos);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
-                                     struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
-                             struct btree *, struct btree_node_iter *,
-                             struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
-       return restart_count != trans->restart_count
-               ? -BCH_ERR_transaction_restart_nested
-               : 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
-                                                  u32 restart_count)
-{
-       if (trans_was_restarted(trans, restart_count))
-               bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
-{
-       if (trans->restarted || !trans->locked)
-               bch2_trans_unlocked_or_in_restart_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
-{
-       BUG_ON(err <= 0);
-       BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
-       trans->restarted = err;
-       trans->last_restarted_ip = ip;
-       return -err;
-}
-
-__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
-{
-       btree_trans_restart_foreign_task(trans, err, ip);
-#ifdef CONFIG_BCACHEFS_DEBUG
-       darray_exit(&trans->last_restarted_trace);
-       bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
-#endif
-       return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
-       return btree_trans_restart_ip(trans, err, _THIS_IP_);
-}
-
-static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
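-       /*
-        * Inject a restart roughly once every 2^(10 + n) calls, where n is
-        * the number of times this transaction has already restarted: the
-        * test passes only when the low min(63, 10 + n) bits of the clock
-        * are all zero, so injection backs off exponentially.
-        */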
-       if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
-               trace_and_count(trans->c, trans_restart_injected, trans, ip);
-               return btree_trans_restart_ip(trans,
-                                       BCH_ERR_transaction_restart_fault_inject, ip);
-       }
-#endif
-       return 0;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
-                            struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
-                                            struct btree_path *path)
-{
-       unsigned new_locks_want = path->level + !!path->intent_ref;
-
-       if (path->locks_want > new_locks_want)
-               __bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
-                                                  struct btree_iter *iter)
-{
-       return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
-}
-
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-       iter->k.type = KEY_TYPE_deleted;
-       iter->k.p.inode         = iter->pos.inode       = new_pos.inode;
-       iter->k.p.offset        = iter->pos.offset      = new_pos.offset;
-       iter->k.p.snapshot      = iter->pos.snapshot    = new_pos.snapshot;
-       iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
-                                          struct btree_iter *iter, struct bpos new_pos)
-{
-       if (unlikely(iter->update_path))
-               bch2_path_put(trans, iter->update_path,
-                             iter->flags & BTREE_ITER_intent);
-       iter->update_path = 0;
-
-       if (!(iter->flags & BTREE_ITER_all_snapshots))
-               new_pos.snapshot = iter->snapshot;
-
-       __bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
-       BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
-       iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
-                                               struct btree_iter *iter, u32 snapshot)
-{
-       struct bpos pos = iter->pos;
-
-       iter->snapshot = snapshot;
-       pos.snapshot = snapshot;
-       bch2_btree_iter_set_pos(trans, iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
-                                            unsigned btree_id,
-                                            unsigned level,
-                                            unsigned flags)
-{
-       if (level || !btree_id_cached(trans->c, btree_id)) {
-               flags &= ~BTREE_ITER_cached;
-               flags &= ~BTREE_ITER_with_key_cache;
-       } else if (!(flags & BTREE_ITER_cached))
-               flags |= BTREE_ITER_with_key_cache;
-
-       if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
-           btree_id_is_extents(btree_id))
-               flags |= BTREE_ITER_is_extents;
-
-       if (!(flags & BTREE_ITER_snapshot_field) &&
-           !btree_type_has_snapshot_field(btree_id))
-               flags &= ~BTREE_ITER_all_snapshots;
-
-       if (!(flags & BTREE_ITER_all_snapshots) &&
-           btree_type_has_snapshots(btree_id))
-               flags |= BTREE_ITER_filter_snapshots;
-
-       if (trans->journal_replay_not_finished)
-               flags |= BTREE_ITER_with_journal;
-
-       return flags;
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
-                                         struct btree_iter *iter,
-                                         unsigned btree_id, struct bpos pos,
-                                         unsigned locks_want,
-                                         unsigned depth,
-                                         unsigned flags,
-                                         unsigned long ip)
-{
-       iter->update_path       = 0;
-       iter->key_cache_path    = 0;
-       iter->btree_id          = btree_id;
-       iter->min_depth         = 0;
-       iter->flags             = flags;
-       iter->snapshot          = pos.snapshot;
-       iter->pos               = pos;
-       iter->k                 = POS_KEY(pos);
-       iter->journal_idx       = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       iter->ip_allocated = ip;
-#endif
-       iter->path = bch2_path_get(trans, btree_id, iter->pos,
-                                  locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-                         enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         unsigned btree_id, struct bpos pos,
-                         unsigned flags)
-{
-       if (__builtin_constant_p(btree_id) &&
-           __builtin_constant_p(flags))
-               bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-                               bch2_btree_iter_flags(trans, btree_id, 0, flags),
-                               _THIS_IP_);
-       else
-               bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
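
(A minimal usage sketch, not part of this patch: the init/peek/exit pattern
these helpers support, assuming a trans obtained from bch2_trans_get() and an
illustrative btree id and pos.)

	static int lookup_one_slot(struct btree_trans *trans, struct bpos pos)
	{
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		/* ... use k here when !ret ... */
		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}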
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
-                              enum btree_id, struct bpos,
-                              unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
-                                     darray_trans_kmalloc_trace *);
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
-
-static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
-                                           unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-       darray_push(&trans->trans_kmalloc_trace,
-                   ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
-#endif
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
-                                                   unsigned long ip)
-{
-       size = roundup(size, 8);
-
-       bch2_trans_kmalloc_trace(trans, size, ip);
-
-       if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-               void *p = trans->mem + trans->mem_top;
-
-               trans->mem_top += size;
-               return p;
-       } else {
-               return __bch2_trans_kmalloc(trans, size, ip);
-       }
-}
-
-static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
-                                         unsigned long ip)
-{
-       size = roundup(size, 8);
-
-       bch2_trans_kmalloc_trace(trans, size, ip);
-
-       if (likely(trans->mem_top + size <= trans->mem_bytes)) {
-               void *p = trans->mem + trans->mem_top;
-
-               trans->mem_top += size;
-               memset(p, 0, size);
-               return p;
-       } else {
-               return __bch2_trans_kmalloc(trans, size, ip);
-       }
-}
-
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin(); on second and subsequent calls
- * bch2_trans_begin() frees all memory allocated in this transaction
- */
-static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
-       return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
-       return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
-}
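
(A hedged sketch of how transaction memory is typically consumed - copying a
key into transaction-lifetime storage; the bkey_s_c k is assumed to come from
a prior peek:)

	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;
	bkey_reassemble(n, k);
	/* n stays valid until the next bch2_trans_begin() frees it */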
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
-                               struct btree_iter *iter,
-                               unsigned btree_id, struct bpos pos,
-                               unsigned flags, unsigned type)
-{
-       struct bkey_s_c k;
-
-       bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
-       k = bch2_btree_iter_peek_slot(trans, iter);
-
-       if (!bkey_err(k) && type && k.k->type != type)
-               k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
-       if (unlikely(bkey_err(k)))
-               bch2_trans_iter_exit(trans, iter);
-       return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
-                               struct btree_iter *iter,
-                               unsigned btree_id, struct bpos pos,
-                               unsigned flags)
-{
-       return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
-       bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,                 \
-                                      _btree_id, _pos, _flags, KEY_TYPE_##_type))
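
(A usage sketch for the typed variant, with illustrative snapshot-table
arguments; the bkey_s_c_to_snapshot() conversion is generated by the key-type
machinery:)

	struct btree_iter iter;
	struct bkey_s_c_snapshot s =
		bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots,
					 POS(0, id), 0, snapshot);
	int ret = bkey_err(s);
	if (ret)
		return ret;
	/* ... use s.v ... */
	bch2_trans_iter_exit(trans, &iter);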
-
-static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
-{
-       unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
-       memcpy(dst_v, src_k.v, b);
-       if (unlikely(b < dst_size))
-               memset(dst_v + b, 0, dst_size - b);
-}
-
-#define bkey_val_copy(_dst_v, _src_k)                                  \
-do {                                                                   \
-       BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v));                 \
-       __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c);           \
-} while (0)
-
-static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
-                               unsigned btree_id, struct bpos pos,
-                               unsigned flags, unsigned type,
-                               unsigned val_size, void *val)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
-       int ret = bkey_err(k);
-       if (!ret) {
-               __bkey_val_copy(val, val_size, k);
-               bch2_trans_iter_exit(trans, &iter);
-       }
-
-       return ret;
-}
-
-#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
-       __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags,      \
-                                 KEY_TYPE_##_type, sizeof(*_val), _val)
-
-void bch2_trans_srcu_unlock(struct btree_trans *);
-
-u32 bch2_trans_begin(struct btree_trans *);
-
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start,                        \
-                             _locks_want, _depth, _flags, _b, _do)             \
-({                                                                             \
-       bch2_trans_begin((_trans));                                             \
-                                                                               \
-       struct btree_iter _iter;                                                \
-       bch2_trans_node_iter_init((_trans), &_iter, (_btree_id),                \
-                                 _start, _locks_want, _depth, _flags);         \
-       int _ret3 = 0;                                                          \
-       do {                                                                    \
-               _ret3 = lockrestart_do((_trans), ({                             \
-                       struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
-                       if (!_b)                                                \
-                               break;                                          \
-                                                                               \
-                       PTR_ERR_OR_ZERO(_b) ?: (_do);                           \
-               })) ?:                                                          \
-               lockrestart_do((_trans),                                        \
-                       PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
-       } while (!_ret3);                                                       \
-                                                                               \
-       bch2_trans_iter_exit((_trans), &(_iter));                               \
-       _ret3;                                                                  \
-})
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start,          \
-                           _flags, _b, _do)                            \
-       __for_each_btree_node(_trans, _iter, _btree_id, _start, \
-                             0, 0, _flags, _b, _do)
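
(A sketch of walking btree nodes with this macro, assuming a struct printbuf
buf declared by the caller; the btree id is illustrative:)

	int ret = bch2_trans_run(c,
		for_each_btree_node(trans, iter, BTREE_ID_extents, POS_MIN,
				    0, b, ({
			bch2_btree_node_to_text(&buf, c, b);
			0;
		})));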
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
-                                                            struct btree_iter *iter,
-                                                            unsigned flags)
-{
-       return  flags & BTREE_ITER_slots      ? bch2_btree_iter_peek_slot(trans, iter) :
-                                               bch2_btree_iter_peek_prev(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
-                                                       struct btree_iter *iter,
-                                                       unsigned flags)
-{
-       return  flags & BTREE_ITER_slots      ? bch2_btree_iter_peek_slot(trans, iter) :
-                                               bch2_btree_iter_peek(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
-                                                           struct btree_iter *iter,
-                                                           struct bpos end,
-                                                           unsigned flags)
-{
-       if (!(flags & BTREE_ITER_slots))
-               return bch2_btree_iter_peek_max(trans, iter, end);
-
-       if (bkey_gt(iter->pos, end))
-               return bkey_s_c_null;
-
-       return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-
-static inline int btree_trans_too_many_iters(struct btree_trans *trans)
-{
-       if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
-               return __bch2_btree_trans_too_many_iters(trans);
-
-       return 0;
-}
-
-/*
- * goto instead of a loop, so that break/continue work correctly when this is
- * used inside for_each_btree_key()
- */
-#define lockrestart_do(_trans, _do)                                    \
-({                                                                     \
-       __label__ transaction_restart;                                  \
-       u32 _restart_count;                                             \
-       int _ret2;                                                      \
-transaction_restart:                                                   \
-       _restart_count = bch2_trans_begin(_trans);                      \
-       _ret2 = (_do);                                                  \
-                                                                       \
-       if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart))       \
-               goto transaction_restart;                               \
-                                                                       \
-       if (!_ret2)                                                     \
-               bch2_trans_verify_not_restarted(_trans, _restart_count);\
-       _ret2;                                                          \
-})
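
(A hedged sketch: retrying a single lookup until it completes without a
transaction restart; the btree id and pos are illustrative:)

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
						BTREE_ID_inodes, pos, 0)));
	if (!ret) {
		/* ... use k ... */
		bch2_trans_iter_exit(trans, &iter);
	}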
-
-/*
- * nested_lockrestart_do(), nested_commit_do():
- *
- * These are like lockrestart_do() and commit_do(), with two differences:
- *
- *  - We don't call bch2_trans_begin() unless we had a transaction restart
- *  - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
- *  transaction restart
- */
-#define nested_lockrestart_do(_trans, _do)                             \
-({                                                                     \
-       u32 _restart_count, _orig_restart_count;                        \
-       int _ret2;                                                      \
-                                                                       \
-       _restart_count = _orig_restart_count = (_trans)->restart_count; \
-                                                                       \
-       while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
-               _restart_count = bch2_trans_begin(_trans);              \
-                                                                       \
-       if (!_ret2)                                                     \
-               bch2_trans_verify_not_restarted(_trans, _restart_count);\
-                                                                       \
-       _ret2 ?: trans_was_restarted(_trans, _orig_restart_count);              \
-})
-
-#define for_each_btree_key_max_continue(_trans, _iter,                 \
-                                        _end, _flags, _k, _do)         \
-({                                                                     \
-       struct bkey_s_c _k;                                             \
-       int _ret3 = 0;                                                  \
-                                                                       \
-       do {                                                            \
-               _ret3 = lockrestart_do(_trans, ({                       \
-                       (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter),  \
-                                               _end, (_flags));        \
-                       if (!(_k).k)                                    \
-                               break;                                  \
-                                                                       \
-                       bkey_err(_k) ?: (_do);                          \
-               }));                                                    \
-       } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter)));  \
-                                                                       \
-       bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret3;                                                          \
-})
-
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do)    \
-       for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_max(_trans, _iter, _btree_id,               \
-                               _start, _end, _flags, _k, _do)          \
-({                                                                     \
-       bch2_trans_begin((_trans));                                     \
-                                                                       \
-       struct btree_iter _iter;                                        \
-       bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
-                            (_start), (_flags));                       \
-                                                                       \
-       for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
-})
-
-#define for_each_btree_key(_trans, _iter, _btree_id,                   \
-                          _start, _flags, _k, _do)                     \
-       for_each_btree_key_max(_trans, _iter, _btree_id, _start,        \
-                                SPOS_MAX, _flags, _k, _do)
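
(The canonical iteration pattern these macros enable - a sketch, with the
btree id and the pr_info() body purely illustrative; _do evaluates to 0 to
continue, or an error to stop:)

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_xattrs, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			pr_info("key at %llu:%llu", k.k->p.inode, k.k->p.offset);
			0;
		})));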
-
-#define for_each_btree_key_reverse(_trans, _iter, _btree_id,           \
-                                  _start, _flags, _k, _do)             \
-({                                                                     \
-       struct btree_iter _iter;                                        \
-       struct bkey_s_c _k;                                             \
-       int _ret3 = 0;                                                  \
-                                                                       \
-       bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
-                            (_start), (_flags));                       \
-                                                                       \
-       do {                                                            \
-               _ret3 = lockrestart_do(_trans, ({                       \
-                       (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
-                                                       (_flags));      \
-                       if (!(_k).k)                                    \
-                               break;                                  \
-                                                                       \
-                       bkey_err(_k) ?: (_do);                          \
-               }));                                                    \
-       } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter)));   \
-                                                                       \
-       bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret3;                                                          \
-})
-
-#define for_each_btree_key_commit(_trans, _iter, _btree_id,            \
-                                 _start, _iter_flags, _k,              \
-                                 _disk_res, _journal_seq, _commit_flags,\
-                                 _do)                                  \
-       for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
-                           (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id,    \
-                                 _start, _iter_flags, _k,              \
-                                 _disk_res, _journal_seq, _commit_flags,\
-                                 _do)                                  \
-       for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
-                           (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_max_commit(_trans, _iter, _btree_id,        \
-                                 _start, _end, _iter_flags, _k,        \
-                                 _disk_res, _journal_seq, _commit_flags,\
-                                 _do)                                  \
-       for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
-                           (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_commit_flags)))
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
-                                                         struct btree_iter *);
-
-#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id,     \
-                          _start, _end, _flags, _k, _ret)              \
-       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
-                                 (_start), (_flags));                  \
-            (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
-            !((_ret) = bkey_err(_k)) && (_k).k;                        \
-            bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
-       for (;                                                                  \
-            (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),      \
-            !((_ret) = bkey_err(_k)) && (_k).k;                                \
-            bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id,         \
-                          _start, _flags, _k, _ret)                    \
-       for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
-                                         SPOS_MAX, _flags, _k, _ret)
-
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id,         \
-                                            _start, _flags, _k, _ret)          \
-       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),              \
-                                 (_start), (_flags));                          \
-            (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags),   \
-            !((_ret) = bkey_err(_k)) && (_k).k;                                \
-            bch2_btree_iter_rewind(_trans, &(_iter)))
-
-#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
-       for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
-
-/*
- * This should not be used in a fastpath without first trying _do in
- * nonblocking mode - it will cause excessive transaction restarts and
- * potential livelocks:
- */
-#define drop_locks_do(_trans, _do)                                     \
-({                                                                     \
-       bch2_trans_unlock(_trans);                                      \
-       (_do) ?: bch2_trans_relock(_trans);                             \
-})
-
-#define allocate_dropping_locks_errcode(_trans, _do)                   \
-({                                                                     \
-       gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;                           \
-       int _ret = _do;                                                 \
-                                                                       \
-       if (bch2_err_matches(_ret, ENOMEM)) {                           \
-               _gfp = GFP_KERNEL;                                      \
-               _ret = drop_locks_do(_trans, _do);                      \
-       }                                                               \
-       _ret;                                                           \
-})
-
-#define allocate_dropping_locks(_trans, _ret, _do)                     \
-({                                                                     \
-       gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;                           \
-       typeof(_do) _p = _do;                                           \
-                                                                       \
-       _ret = 0;                                                       \
-       if (unlikely(!_p)) {                                            \
-               _gfp = GFP_KERNEL;                                      \
-               _ret = drop_locks_do(_trans, ((_p = _do), 0));          \
-       }                                                               \
-       _p;                                                             \
-})
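
(A usage sketch, mirroring how the btree key cache allocates entries: _do
must reference the macro-provided _gfp variable, and the result can still be
NULL if even the GFP_KERNEL retry fails:)

	struct bkey_cached *ck;
	int ret;

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ret;
	}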
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
-void bch2_trans_put(struct btree_trans *);
-
-bool bch2_current_has_btree_trans(struct bch_fs *);
-
-extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
-unsigned bch2_trans_get_fn_idx(const char *);
-
-#define bch2_trans_get(_c)                                             \
-({                                                                     \
-       static unsigned trans_fn_idx;                                   \
-                                                                       \
-       if (unlikely(!trans_fn_idx))                                    \
-               trans_fn_idx = bch2_trans_get_fn_idx(__func__);         \
-       __bch2_trans_get(_c, trans_fn_idx);                             \
-})
-
-/*
- * We don't use DEFINE_CLASS() because using a function for the constructor
- * breaks bch2_trans_get()'s use of __func__
- */
-typedef struct btree_trans * class_btree_trans_t;
-static inline void class_btree_trans_destructor(struct btree_trans **p)
-{
-       struct btree_trans *trans = *p;
-       bch2_trans_put(trans);
-}
-
-#define class_btree_trans_constructor(_c)      bch2_trans_get(_c)
-
-#define bch2_trans_run(_c, _do)                                                \
-({                                                                     \
-       CLASS(btree_trans, trans)(_c);                                  \
-       (_do);                                                          \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
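
(A sketch of the complete pattern: bch2_trans_do() creates the transaction,
retries _do on restart, and puts the transaction on scope exit; the delete
target and pos are illustrative:)

	int ret = bch2_trans_do(c,
		bch2_btree_delete(trans, BTREE_ID_xattrs, pos, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0));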
-
-void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
-
-void bch2_fs_btree_iter_exit(struct bch_fs *);
-void bch2_fs_btree_iter_init_early(struct bch_fs *);
-int bch2_fs_btree_iter_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
deleted file mode 100644 (file)
index ea83956..0000000
+++ /dev/null
@@ -1,830 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "journal_io.h"
-
-#include <linux/sort.h>
-
-/*
- * For managing keys we read from the journal: until journal replay has
- * finished, normal btree lookups need to be able to find and return keys from
- * the journal where they overwrite what's in the btree, so we have a special
- * iterator and operations for the regular btree iter code to use:
- */
-
-static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
-{
-       size_t gap_size = keys->size - keys->nr;
-
-       BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
-
-       if (pos >= keys->gap)
-               pos -= gap_size;
-       return pos;
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
-       size_t gap_size = keys->size - keys->nr;
-
-       if (idx >= keys->gap)
-               idx += gap_size;
-       return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
-       return keys->data + idx_to_pos(keys, idx);
-}
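
(A self-contained illustration of the gap-buffer index mapping above, with
hypothetical numbers; this compiles standalone, outside the kernel:)

	#include <assert.h>
	#include <stddef.h>

	struct keys { size_t nr, size, gap; };

	static size_t idx_to_pos(struct keys *keys, size_t idx)
	{
		size_t gap_size = keys->size - keys->nr;

		if (idx >= keys->gap)	/* skip over the gap */
			idx += gap_size;
		return idx;
	}

	int main(void)
	{
		/* 8 slots, 5 live keys, gap of 3 empty slots at position 2 */
		struct keys keys = { .nr = 5, .size = 8, .gap = 2 };

		assert(idx_to_pos(&keys, 1) == 1);	/* before the gap */
		assert(idx_to_pos(&keys, 2) == 5);	/* lands after the gap */
		assert(idx_to_pos(&keys, 4) == 7);	/* last live key */
		return 0;
	}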
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
-                                       enum btree_id id, unsigned level,
-                                       struct bpos pos)
-{
-       size_t l = 0, r = keys->nr, m;
-
-       while (l < r) {
-               m = l + ((r - l) >> 1);
-               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
-                       l = m + 1;
-               else
-                       r = m;
-       }
-
-       BUG_ON(l < keys->nr &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
-       BUG_ON(l &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
-       return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
-                                     enum btree_id id, unsigned level,
-                                     struct bpos pos)
-{
-       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos,
-                                          struct bpos end_pos, size_t *idx)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       unsigned iters = 0;
-       struct journal_key *k;
-
-       BUG_ON(*idx > keys->nr);
-search:
-       if (!*idx)
-               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
-       while (*idx &&
-              __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
-               --(*idx);
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       goto search;
-               }
-       }
-
-       struct bkey_i *ret = NULL;
-       rcu_read_lock(); /* for overwritten_ranges */
-
-       while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
-               if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
-                       break;
-
-               if (k->overwritten) {
-                       if (k->overwritten_range)
-                               *idx = rcu_dereference(k->overwritten_range)->end;
-                       else
-                               *idx += 1;
-                       continue;
-               }
-
-               if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
-                       ret = k->k;
-                       break;
-               }
-
-               (*idx)++;
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       rcu_read_unlock();
-                       goto search;
-               }
-       }
-
-       rcu_read_unlock();
-       return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos,
-                                          struct bpos end_pos, size_t *idx)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       unsigned iters = 0;
-       struct journal_key *k;
-
-       BUG_ON(*idx > keys->nr);
-
-       if (!keys->nr)
-               return NULL;
-search:
-       if (!*idx)
-               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
-       while (*idx < keys->nr &&
-              __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
-               (*idx)++;
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       goto search;
-               }
-       }
-
-       if (*idx == keys->nr)
-               --(*idx);
-
-       struct bkey_i *ret = NULL;
-       rcu_read_lock(); /* for overwritten_ranges */
-
-       while (true) {
-               k = idx_to_key(keys, *idx);
-               if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
-                       break;
-
-               if (k->overwritten) {
-                       if (k->overwritten_range)
-                               *idx = rcu_dereference(k->overwritten_range)->start;
-                       if (!*idx)
-                               break;
-                       --(*idx);
-                       continue;
-               }
-
-               if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
-                       ret = k->k;
-                       break;
-               }
-
-               if (!*idx)
-                       break;
-               --(*idx);
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       goto search;
-               }
-       }
-
-       rcu_read_unlock();
-       return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos)
-{
-       size_t idx = 0;
-
-       return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iter_verify(struct journal_iter *iter)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct journal_keys *keys = iter->keys;
-       size_t gap_size = keys->size - keys->nr;
-
-       BUG_ON(iter->idx >= keys->gap &&
-              iter->idx <  keys->gap + gap_size);
-
-       if (iter->idx < keys->size) {
-               struct journal_key *k = keys->data + iter->idx;
-
-               int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
-               BUG_ON(cmp > 0);
-       }
-#endif
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       /* The key we just inserted is immediately before the gap: */
-       size_t gap_end = keys->gap + (keys->size - keys->nr);
-       struct journal_key *new_key = &keys->data[keys->gap - 1];
-       struct journal_iter *iter;
-
-       /*
-        * If an iterator points one after the key we just inserted, decrement
-        * the iterator so it points at the key we just inserted - if the
-        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
-        * handle that:
-        */
-       list_for_each_entry(iter, &c->journal_iters, list) {
-               journal_iter_verify(iter);
-               if (iter->idx           == gap_end &&
-                   new_key->btree_id   == iter->btree_id &&
-                   new_key->level      == iter->level)
-                       iter->idx = keys->gap - 1;
-               journal_iter_verify(iter);
-       }
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       struct journal_iter *iter;
-       size_t gap_size = keys->size - keys->nr;
-
-       list_for_each_entry(iter, &c->journal_iters, list) {
-               if (iter->idx > old_gap)
-                       iter->idx -= gap_size;
-               if (iter->idx >= new_gap)
-                       iter->idx += gap_size;
-       }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
-                                unsigned level, struct bkey_i *k)
-{
-       struct journal_key n = {
-               .btree_id       = id,
-               .level          = level,
-               .k              = k,
-               .allocated      = true,
-               /*
-                * Ensure these keys are done last by journal replay, to unblock
-                * journal reclaim:
-                */
-               .journal_seq    = U64_MAX,
-       };
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
-       BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
-       if (idx < keys->size &&
-           journal_key_cmp(&n, &keys->data[idx]) == 0) {
-               if (keys->data[idx].allocated)
-                       kfree(keys->data[idx].k);
-               keys->data[idx] = n;
-               return 0;
-       }
-
-       if (idx > keys->gap)
-               idx -= keys->size - keys->nr;
-
-       size_t old_gap = keys->gap;
-
-       if (keys->nr == keys->size) {
-               journal_iters_move_gap(c, old_gap, keys->size);
-               old_gap = keys->size;
-
-               struct journal_keys new_keys = {
-                       .nr                     = keys->nr,
-                       .size                   = max_t(size_t, keys->size, 8) * 2,
-               };
-
-               new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
-               if (!new_keys.data) {
-                       bch_err(c, "%s: error allocating new key array (size %zu)",
-                               __func__, new_keys.size);
-                       return bch_err_throw(c, ENOMEM_journal_key_insert);
-               }
-
-               /* Since @keys was full, there was no gap: */
-               memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
-               kvfree(keys->data);
-               keys->data      = new_keys.data;
-               keys->nr        = new_keys.nr;
-               keys->size      = new_keys.size;
-
-               /* And now the gap is at the end: */
-               keys->gap       = keys->nr;
-       }
-
-       journal_iters_move_gap(c, old_gap, idx);
-
-       move_gap(keys, idx);
-
-       keys->nr++;
-       keys->data[keys->gap++] = n;
-
-       journal_iters_fix(c);
-
-       return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bkey_i *k)
-{
-       struct bkey_i *n;
-       int ret;
-
-       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
-       if (!n)
-               return bch_err_throw(c, ENOMEM_journal_key_insert);
-
-       bkey_copy(n, k);
-       ret = bch2_journal_key_insert_take(c, id, level, n);
-       if (ret)
-               kfree(n);
-       return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bpos pos)
-{
-       struct bkey_i whiteout;
-
-       bkey_init(&whiteout.k);
-       whiteout.k.p = pos;
-
-       return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
-                                unsigned level, struct bpos pos)
-{
-       struct journal_keys *keys = &trans->c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-       if (!trans->journal_replay_not_finished)
-               return false;
-
-       return (idx < keys->size &&
-               keys->data[idx].btree_id        == btree &&
-               keys->data[idx].level           == level &&
-               bpos_eq(keys->data[idx].k->k.p, pos) &&
-               bkey_deleted(&keys->data[idx].k->k));
-}
-
-static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
-{
-       struct journal_key *k = keys->data + pos;
-       size_t idx = pos_to_idx(keys, pos);
-
-       k->overwritten = true;
-
-       struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
-       struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
-
-       bool prev_overwritten = prev && prev->overwritten;
-       bool next_overwritten = next && next->overwritten;
-
-       struct journal_key_range_overwritten *prev_range =
-               prev_overwritten ? prev->overwritten_range : NULL;
-       struct journal_key_range_overwritten *next_range =
-               next_overwritten ? next->overwritten_range : NULL;
-
-       BUG_ON(prev_range && prev_range->end != idx);
-       BUG_ON(next_range && next_range->start != idx + 1);
-
-       if (prev_range && next_range) {
-               prev_range->end = next_range->end;
-
-               keys->data[pos].overwritten_range = prev_range;
-               for (size_t i = next_range->start; i < next_range->end; i++) {
-                       struct journal_key *ip = keys->data + idx_to_pos(keys, i);
-                       BUG_ON(ip->overwritten_range != next_range);
-                       ip->overwritten_range = prev_range;
-               }
-
-               kfree_rcu_mightsleep(next_range);
-       } else if (prev_range) {
-               prev_range->end++;
-               k->overwritten_range = prev_range;
-               if (next_overwritten) {
-                       prev_range->end++;
-                       next->overwritten_range = prev_range;
-               }
-       } else if (next_range) {
-               next_range->start--;
-               k->overwritten_range = next_range;
-               if (prev_overwritten) {
-                       next_range->start--;
-                       prev->overwritten_range = next_range;
-               }
-       } else if (prev_overwritten || next_overwritten) {
-               struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
-               if (!r)
-                       return;
-
-               r->start = idx - (size_t) prev_overwritten;
-               r->end = idx + 1 + (size_t) next_overwritten;
-
-               rcu_assign_pointer(k->overwritten_range, r);
-               if (prev_overwritten)
-                       prev->overwritten_range = r;
-               if (next_overwritten)
-                       next->overwritten_range = r;
-       }
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
-                                 unsigned level, struct bpos pos)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-       if (idx < keys->size &&
-           keys->data[idx].btree_id    == btree &&
-           keys->data[idx].level       == level &&
-           bpos_eq(keys->data[idx].k->k.p, pos) &&
-           !keys->data[idx].overwritten) {
-               mutex_lock(&keys->overwrite_lock);
-               __bch2_journal_key_overwritten(keys, idx);
-               mutex_unlock(&keys->overwrite_lock);
-       }
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-       if (iter->idx < iter->keys->size) {
-               iter->idx++;
-               if (iter->idx == iter->keys->gap)
-                       iter->idx += iter->keys->size - iter->keys->nr;
-       }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
-       journal_iter_verify(iter);
-
-       guard(rcu)();
-       while (iter->idx < iter->keys->size) {
-               struct journal_key *k = iter->keys->data + iter->idx;
-
-               int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
-               if (cmp < 0)
-                       break;
-               BUG_ON(cmp);
-
-               if (!k->overwritten)
-                       return bkey_i_to_s_c(k->k);
-
-               if (k->overwritten_range)
-                       iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
-               else
-                       bch2_journal_iter_advance(iter);
-       }
-
-       return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
-       list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
-                                  struct journal_iter *iter,
-                                  enum btree_id id, unsigned level,
-                                  struct bpos pos)
-{
-       iter->btree_id  = id;
-       iter->level     = level;
-       iter->keys      = &c->journal_keys;
-       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-
-       journal_iter_verify(iter);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
-       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-                                               iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
-       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
-       if (bpos_eq(iter->pos, SPOS_MAX))
-               iter->at_end = true;
-       else
-               iter->pos = bpos_successor(iter->pos);
-}
-
-static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
-{
-       struct btree_and_journal_iter iter = *_iter;
-       struct bch_fs *c = iter.trans->c;
-       unsigned level = iter.journal.level;
-       struct bkey_buf tmp;
-       unsigned nr = test_bit(BCH_FS_started, &c->flags)
-               ? (level > 1 ? 0 :  2)
-               : (level > 1 ? 1 : 16);
-
-       iter.prefetch = false;
-       iter.fail_if_too_many_whiteouts = true;
-       bch2_bkey_buf_init(&tmp);
-
-       while (nr--) {
-               bch2_btree_and_journal_iter_advance(&iter);
-               struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
-               if (!k.k)
-                       break;
-
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-               bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
-       }
-
-       bch2_bkey_buf_exit(&tmp, c);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
-       struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
-       size_t iters = 0;
-
-       if (iter->prefetch && iter->journal.level)
-               btree_and_journal_iter_prefetch(iter);
-again:
-       if (iter->at_end)
-               return bkey_s_c_null;
-
-       iters++;
-
-       if (iters > 20 && iter->fail_if_too_many_whiteouts)
-               return bkey_s_c_null;
-
-       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
-              bpos_lt(btree_k.k->p, iter->pos))
-               bch2_journal_iter_advance_btree(iter);
-
-       if (iter->trans->journal_replay_not_finished)
-               while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-                      bpos_lt(journal_k.k->p, iter->pos))
-                       bch2_journal_iter_advance(&iter->journal);
-
-       ret = journal_k.k &&
-               (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
-               ? journal_k
-               : btree_k;
-
-       if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
-               ret = bkey_s_c_null;
-
-       if (ret.k) {
-               iter->pos = ret.k->p;
-               if (bkey_deleted(ret.k)) {
-                       bch2_btree_and_journal_iter_advance(iter);
-                       goto again;
-               }
-       } else {
-               iter->pos = SPOS_MAX;
-               iter->at_end = true;
-       }
-
-       return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
-       bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-                                                 struct btree_and_journal_iter *iter,
-                                                 struct btree *b,
-                                                 struct btree_node_iter node_iter,
-                                                 struct bpos pos)
-{
-       memset(iter, 0, sizeof(*iter));
-
-       iter->trans = trans;
-       iter->b = b;
-       iter->node_iter = node_iter;
-       iter->pos = b->data->min_key;
-       iter->at_end = false;
-       INIT_LIST_HEAD(&iter->journal.list);
-
-       if (trans->journal_replay_not_finished) {
-               bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
-               if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
-                       list_add(&iter->journal.list, &trans->c->journal_iters);
-       }
-}
-
-/*
- * This version is used by btree_gc before the filesystem has gone RW and
- * multithreaded, so it uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
-                                               struct btree_and_journal_iter *iter,
-                                               struct btree *b)
-{
-       struct btree_node_iter node_iter;
-
-       bch2_btree_node_iter_init_from_start(&node_iter, b);
-       __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
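
(A sketch of the peek/advance loop these iterators support, as used during
recovery and gc; trans and the btree node b are assumed to be in hand:)

	struct btree_and_journal_iter iter;
	struct bkey_s_c k;

	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		/* ... process k: journal keys shadow btree keys ... */
		bch2_btree_and_journal_iter_advance(&iter);
	}
	bch2_btree_and_journal_iter_exit(&iter);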
-
-/* sort and dedup all keys in the journal: */
-
-/*
- * When keys compare equal, oldest compares first: dedup in
- * __journal_keys_sort() keeps the last of each run of equal keys, so sorting
- * oldest-first means the newest version wins - unless both keys come from a
- * journal rewind, in which case the order is flipped and the oldest survives:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
-       const struct journal_key *l = _l;
-       const struct journal_key *r = _r;
-       int rewind = l->rewind && r->rewind ? -1 : 1;
-
-       return  journal_key_cmp(l, r) ?:
-               ((cmp_int(l->journal_seq, r->journal_seq) ?:
-                 cmp_int(l->journal_offset, r->journal_offset)) * rewind);
-}
-
-void bch2_journal_keys_put(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-
-       BUG_ON(atomic_read(&keys->ref) <= 0);
-
-       if (!atomic_dec_and_test(&keys->ref))
-               return;
-
-       move_gap(keys, keys->nr);
-
-       darray_for_each(*keys, i) {
-               if (i->overwritten_range &&
-                   (i == &darray_last(*keys) ||
-                    i->overwritten_range != i[1].overwritten_range))
-                       kfree(i->overwritten_range);
-
-               if (i->allocated)
-                       kfree(i->k);
-       }
-
-       kvfree(keys->data);
-       keys->data = NULL;
-       keys->nr = keys->gap = keys->size = 0;
-
-       struct journal_replay **i;
-       struct genradix_iter iter;
-
-       genradix_for_each(&c->journal_entries, iter, i)
-               kvfree(*i);
-       genradix_free(&c->journal_entries);
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
-       sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
-                      journal_sort_key_cmp, NULL);
-
-       cond_resched();
-
-       struct journal_key *dst = keys->data;
-
-       darray_for_each(*keys, src) {
-               /*
-                * We don't accumulate accounting keys here because we have to
-                * compare each individual accounting key against the version in
-                * the btree during replay:
-                */
-               if (src->k->k.type != KEY_TYPE_accounting &&
-                   src + 1 < &darray_top(*keys) &&
-                   !journal_key_cmp(src, src + 1))
-                       continue;
-
-               *dst++ = *src;
-       }
-
-       keys->nr = dst - keys->data;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *c)
-{
-       struct genradix_iter iter;
-       struct journal_replay *i, **_i;
-       struct journal_keys *keys = &c->journal_keys;
-       size_t nr_read = 0;
-
-       u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (journal_replay_ignore(i))
-                       continue;
-
-               cond_resched();
-
-               vstruct_for_each(&i->j, entry) {
-                       bool rewind = !entry->level &&
-                               !btree_id_is_alloc(entry->btree_id) &&
-                               le64_to_cpu(i->j.seq) >= rewind_seq;
-
-                       if (entry->type != (rewind
-                                           ? BCH_JSET_ENTRY_overwrite
-                                           : BCH_JSET_ENTRY_btree_keys))
-                               continue;
-
-                       if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
-                               continue;
-
-                       jset_entry_for_each_key(entry, k) {
-                               struct journal_key n = (struct journal_key) {
-                                       .btree_id       = entry->btree_id,
-                                       .level          = entry->level,
-                                       .rewind         = rewind,
-                                       .k              = k,
-                                       .journal_seq    = le64_to_cpu(i->j.seq),
-                                       .journal_offset = k->_data - i->j._data,
-                               };
-
-                               if (darray_push(keys, n)) {
-                                       __journal_keys_sort(keys);
-
-                                       if (keys->nr * 8 > keys->size * 7) {
-                                               bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
-                                                       keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
-                                               return bch_err_throw(c, ENOMEM_journal_keys_sort);
-                                       }
-
-                                       BUG_ON(darray_push(keys, n));
-                               }
-
-                               nr_read++;
-                       }
-               }
-       }
-
-       __journal_keys_sort(keys);
-       keys->gap = keys->nr;
-
-       bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
-       return 0;
-}
-
-void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
-                                 unsigned level_min, unsigned level_max,
-                                 struct bpos start, struct bpos end)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       size_t dst = 0;
-
-       move_gap(keys, keys->nr);
-
-       darray_for_each(*keys, i)
-               if (!(i->btree_id == btree &&
-                     i->level >= level_min &&
-                     i->level <= level_max &&
-                     bpos_ge(i->k->k.p, start) &&
-                     bpos_le(i->k->k.p, end)))
-                       keys->data[dst++] = *i;
-       keys->nr = keys->gap = dst;
-}
-
-void bch2_journal_keys_dump(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       struct printbuf buf = PRINTBUF;
-
-       pr_info("%zu keys:", keys->nr);
-
-       move_gap(keys, keys->nr);
-
-       darray_for_each(*keys, i) {
-               printbuf_reset(&buf);
-               prt_printf(&buf, "btree=");
-               bch2_btree_id_to_text(&buf, i->btree_id);
-               prt_printf(&buf, " l=%u ", i->level);
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
-               pr_err("%s", buf.buf);
-       }
-       printbuf_exit(&buf);
-}
-
-void bch2_fs_journal_keys_init(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-
-       atomic_set(&keys->ref, 1);
-       keys->initial_ref_held = true;
-       mutex_init(&keys->overwrite_lock);
-}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
deleted file mode 100644 (file)
index 2a30829..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_H
-
-#include "bkey.h"
-
-struct journal_iter {
-       struct list_head        list;
-       enum btree_id           btree_id;
-       unsigned                level;
-       size_t                  idx;
-       struct journal_keys     *keys;
-};
-
-/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
- */
-
-struct btree_and_journal_iter {
-       struct btree_trans      *trans;
-       struct btree            *b;
-       struct btree_node_iter  node_iter;
-       struct bkey             unpacked;
-
-       struct journal_iter     journal;
-       struct bpos             pos;
-       bool                    at_end;
-       bool                    prefetch;
-       bool                    fail_if_too_many_whiteouts;
-};
-
-static inline int __journal_key_btree_cmp(enum btree_id        l_btree_id,
-                                         unsigned      l_level,
-                                         const struct journal_key *r)
-{
-       return -cmp_int(l_level,        r->level) ?:
-               cmp_int(l_btree_id,     r->btree_id);
-}
-
-static inline int __journal_key_cmp(enum btree_id      l_btree_id,
-                                   unsigned            l_level,
-                                   struct bpos l_pos,
-                                   const struct journal_key *r)
-{
-       return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
-               bpos_cmp(l_pos, r->k->k.p);
-}
-
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
-       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
-                               unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
-                               unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
-                                          unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
-                                        struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
-                                unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
-                           unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
-                           unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-                               struct btree_and_journal_iter *, struct btree *,
-                               struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
-                               struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
-       if (c->journal_keys.initial_ref_held)
-               bch2_journal_keys_put(c);
-       c->journal_keys.initial_ref_held = false;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
-                                 unsigned, unsigned,
-                                 struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-void bch2_fs_journal_keys_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
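
The comparators above chain three-way comparisons with GNU C's binary
"?:" operator: "a ?: b" evaluates to a when a is nonzero, so each later
cmp_int() only breaks ties left by the earlier ones. A minimal standalone
sketch of the idiom (cmp_int reimplemented here; not bcachefs code):

    #include <stdio.h>

    /* three-way compare yielding -1, 0 or 1, like the kernel's cmp_int() */
    #define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

    struct key { unsigned btree_id, level; };

    static int key_cmp(struct key l, struct key r)
    {
            return cmp_int(l.btree_id, r.btree_id) ?:
                    cmp_int(l.level, r.level);
    }

    int main(void)
    {
            struct key a = { .btree_id = 1, .level = 2 };
            struct key b = { .btree_id = 1, .level = 0 };

            printf("%d\n", key_cmp(a, b)); /* 1: btree_id ties, level decides */
            return 0;
    }
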
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
deleted file mode 100644 (file)
index 86aacb2..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-
-struct journal_key_range_overwritten {
-       size_t                  start, end;
-};
-
-struct journal_key {
-       u64                     journal_seq;
-       u32                     journal_offset;
-       enum btree_id           btree_id:8;
-       unsigned                level:8;
-       bool                    allocated:1;
-       bool                    overwritten:1;
-       bool                    rewind:1;
-       struct journal_key_range_overwritten __rcu *
-                               overwritten_range;
-       struct bkey_i           *k;
-};
-
-struct journal_keys {
-       /* must match layout in darray_types.h */
-       size_t                  nr, size;
-       struct journal_key      *data;
-       /*
-        * Gap buffer: instead of all the empty space in the array being at the
-        * end of the buffer - from @nr to @size - the empty space is at @gap.
-        * This means that sequential insertions are O(n) instead of O(n^2).
-        */
-       size_t                  gap;
-       atomic_t                ref;
-       bool                    initial_ref_held;
-       struct mutex            overwrite_lock;
-};
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
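
The gap-buffer comment in struct journal_keys above is the load-bearing
design note: because the empty space sits at @gap rather than at the end
of the array, an insertion only has to memmove the distance the gap moves,
so a run of insertions at nearby (e.g. sorted) positions is cheap. A
simplified sketch with int elements (hypothetical helpers, not bcachefs
code; capacity checks omitted):

    #include <string.h>

    struct gap_buf {
            int     *data;
            size_t  nr, size;       /* elements used / slots allocated */
            size_t  gap;            /* index where the empty region starts */
    };

    /* move the empty region so that it starts at @pos */
    static void gap_move(struct gap_buf *b, size_t pos)
    {
            size_t gap_len = b->size - b->nr;

            if (pos < b->gap)       /* shift [pos, gap) up, past the gap */
                    memmove(b->data + pos + gap_len, b->data + pos,
                            (b->gap - pos) * sizeof(*b->data));
            else if (pos > b->gap)  /* shift (gap, pos] down, over the gap */
                    memmove(b->data + b->gap, b->data + b->gap + gap_len,
                            (pos - b->gap) * sizeof(*b->data));
            b->gap = pos;
    }

    /* caller has ensured nr < size */
    static void gap_insert(struct gap_buf *b, size_t pos, int v)
    {
            gap_move(b, pos);       /* O(|pos - gap|): O(1) when sequential */
            b->data[b->gap++] = v;
            b->nr++;
    }

This is where the comment's claim comes from: n sequential insertions cost
O(n) total, versus the O(n^2) of a plain sorted array that shifts its tail
on every insert.
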
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644 (file)
index d96188b..0000000
+++ /dev/null
@@ -1,880 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
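-       /*
-        * pcpu reader locks make taking read locks contention-free, at the
-        * cost of more expensive intent/write locking - worthwhile only for
-        * btrees whose cached keys are hot and read-mostly:
-        */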
-       return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
-                                      const void *obj)
-{
-       const struct bkey_cached *ck = obj;
-       const struct bkey_cached_key *key = arg->key;
-
-       return ck->key.btree_id != key->btree_id ||
-               !bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
-       .head_offset            = offsetof(struct bkey_cached, hash),
-       .key_offset             = offsetof(struct bkey_cached, key),
-       .key_len                = sizeof(struct bkey_cached_key),
-       .obj_cmpfn              = bch2_btree_key_cache_cmp_fn,
-       .automatic_shrinking    = true,
-};
-
-static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
-                                        struct bkey_cached *ck,
-                                        enum btree_node_locked_type lock_held)
-{
-       path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
-       path->l[0].b            = (void *) ck;
-       mark_btree_node_locked(trans, path, 0, lock_held);
-}
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
-       struct bkey_cached_key key = {
-               .btree_id       = btree_id,
-               .pos            = pos,
-       };
-
-       return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
-                                     bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
-       if (!six_trylock_intent(&ck->c.lock))
-               return false;
-
-       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               six_unlock_intent(&ck->c.lock);
-               return false;
-       }
-
-       if (!six_trylock_write(&ck->c.lock)) {
-               six_unlock_intent(&ck->c.lock);
-               return false;
-       }
-
-       return true;
-}
-
-static bool bkey_cached_evict(struct btree_key_cache *c,
-                             struct bkey_cached *ck)
-{
-       bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
-                                     bch2_btree_key_cache_params);
-       if (ret) {
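-               /* poison the key so this entry can't match any further lookups */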
-               memset(&ck->key, ~0, sizeof(ck->key));
-               atomic_long_dec(&c->nr_keys);
-       }
-
-       return ret;
-}
-
-static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
-{
-       struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
-       struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
-
-       this_cpu_dec(*c->btree_key_cache.nr_pending);
-       kmem_cache_free(bch2_key_cache, ck);
-}
-
-static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
-                                     struct bkey_cached *ck)
-{
-       kfree(ck->k);
-       ck->k           = NULL;
-       ck->u64s        = 0;
-
-       six_unlock_write(&ck->c.lock);
-       six_unlock_intent(&ck->c.lock);
-
-       bool pcpu_readers = ck->c.lock.readers != NULL;
-       rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
-       this_cpu_inc(*bc->nr_pending);
-}
-
-static void bkey_cached_free(struct btree_trans *trans,
-                            struct btree_key_cache *bc,
-                            struct bkey_cached *ck)
-{
-       /*
-        * we'll hit strange issues in the SRCU code if we aren't holding an
-        * SRCU read lock...
-        */
-       EBUG_ON(!trans->srcu_held);
-
-       bkey_cached_free_noassert(bc, ck);
-}
-
-static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
-{
-       gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
-       struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
-       if (unlikely(!ck))
-               return NULL;
-       ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
-       if (unlikely(!ck->k)) {
-               kmem_cache_free(bch2_key_cache, ck);
-               return NULL;
-       }
-       ck->u64s = key_u64s;
-       return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_key_cache *bc = &c->btree_key_cache;
-       bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
-       int ret;
-
-       struct bkey_cached *ck = container_of_or_null(
-                               rcu_pending_dequeue(&bc->pending[pcpu_readers]),
-                               struct bkey_cached, rcu);
-       if (ck)
-               goto lock;
-
-       ck = allocate_dropping_locks(trans, ret,
-                                    __bkey_cached_alloc(key_u64s, _gfp));
-       if (ret) {
-               if (ck)
-                       kfree(ck->k);
-               kmem_cache_free(bch2_key_cache, ck);
-               return ERR_PTR(ret);
-       }
-
-       if (ck) {
-               bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
-               ck->c.cached = true;
-               goto lock;
-       }
-
-       ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
-                                 struct bkey_cached, rcu);
-       if (ck)
-               goto lock;
-
-       return NULL;
-lock:
-       six_lock_intent(&ck->c.lock, NULL, NULL);
-       six_lock_write(&ck->c.lock, NULL, NULL);
-       return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_reuse(struct btree_key_cache *c)
-{
-       guard(rcu)();
-       struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
-       struct rhash_head *pos;
-       struct bkey_cached *ck;
-
-       for (unsigned i = 0; i < tbl->size; i++)
-               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-                       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-                           bkey_cached_lock_for_evict(ck)) {
-                               if (bkey_cached_evict(c, ck))
-                                       return ck;
-                               six_unlock_write(&ck->c.lock);
-                               six_unlock_intent(&ck->c.lock);
-                       }
-               }
-       return NULL;
-}
-
-static int btree_key_cache_create(struct btree_trans *trans,
-                                 struct btree_path *path,
-                                 struct btree_path *ck_path,
-                                 struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_key_cache *bc = &c->btree_key_cache;
-
-       /*
-        * bch2_varint_decode can read past the end of the buffer by at
-        * most 7 bytes (it won't be used):
-        */
-       unsigned key_u64s = k.k->u64s + 1;
-
-       /*
-        * Allocate some extra space so that the transaction commit path is less
-        * likely to have to reallocate, since that requires a transaction
-        * restart:
-        */
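-       /*
-        * k.k->u64s is a u8, so key_u64s <= 256 here and the clamp below
-        * can never underallocate:
-        */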
-       key_u64s = min(256U, (key_u64s * 3) / 2);
-       key_u64s = roundup_pow_of_two(key_u64s);
-
-       struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
-       int ret = PTR_ERR_OR_ZERO(ck);
-       if (ret)
-               return ret;
-
-       if (unlikely(!ck)) {
-               ck = bkey_cached_reuse(bc);
-               if (unlikely(!ck)) {
-                       bch_err(c, "error allocating memory for key cache item, btree %s",
-                               bch2_btree_id_str(ck_path->btree_id));
-                       return bch_err_throw(c, ENOMEM_btree_key_cache_create);
-               }
-       }
-
-       ck->c.level             = 0;
-       ck->c.btree_id          = ck_path->btree_id;
-       ck->key.btree_id        = ck_path->btree_id;
-       ck->key.pos             = ck_path->pos;
-       ck->flags               = 1U << BKEY_CACHED_ACCESSED;
-
-       if (unlikely(key_u64s > ck->u64s)) {
-               mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
-               struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
-                               kmalloc(key_u64s * sizeof(u64), _gfp));
-               if (unlikely(!new_k)) {
-                       bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-                               bch2_btree_id_str(ck->key.btree_id), key_u64s);
-                       ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
-                       goto err;
-               }
-               if (ret) {
-                       kfree(new_k);
-                       goto err;
-               }
-
-               kfree(ck->k);
-               ck->k = new_k;
-               ck->u64s = key_u64s;
-       }
-
-       bkey_reassemble(ck->k, k);
-
-       ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
-       if (unlikely(ret))
-               goto err;
-
-       ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
-
-       bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
-
-       if (unlikely(ret)) /* raced with another fill? */
-               goto err;
-
-       atomic_long_inc(&bc->nr_keys);
-       six_unlock_write(&ck->c.lock);
-
-       enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
-       if (lock_want == SIX_LOCK_read)
-               six_lock_downgrade(&ck->c.lock);
-       btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
-       ck_path->uptodate = BTREE_ITER_UPTODATE;
-       return 0;
-err:
-       bkey_cached_free(trans, bc, ck);
-       mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
-       return ret;
-}
-
-static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
-                                                      struct btree_path *ck_path,
-                                                      struct bkey_s_c k)
-{
-       struct printbuf buf = PRINTBUF;
-
-       bch2_bpos_to_text(&buf, ck_path->pos);
-       prt_char(&buf, ' ');
-       bch2_bkey_val_to_text(&buf, trans->c, k);
-       trace_key_cache_fill(trans, buf.buf);
-       printbuf_exit(&buf);
-}
-
-static noinline int btree_key_cache_fill(struct btree_trans *trans,
-                                        btree_path_idx_t ck_path_idx,
-                                        unsigned flags)
-{
-       struct btree_path *ck_path = trans->paths + ck_path_idx;
-
-       if (flags & BTREE_ITER_cached_nofill) {
-               ck_path->l[0].b = NULL;
-               return 0;
-       }
-
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
-                            BTREE_ITER_intent|
-                            BTREE_ITER_key_cache_fill|
-                            BTREE_ITER_cached_nofill);
-       iter.flags &= ~BTREE_ITER_with_journal;
-       k = bch2_btree_iter_peek_slot(trans, &iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       /* Recheck after btree lookup, before allocating: */
-       ck_path = trans->paths + ck_path_idx;
-       ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
-       if (unlikely(ret))
-               goto out;
-
-       ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
-       if (ret)
-               goto err;
-
-       if (trace_key_cache_fill_enabled())
-               do_trace_key_cache_fill(trans, ck_path, k);
-out:
-       /* We're not likely to need this iterator again: */
-       bch2_set_btree_iter_dontneed(trans, &iter);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
-                                                 btree_path_idx_t path_idx)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_cached *ck;
-       struct btree_path *path = trans->paths + path_idx;
-retry:
-       ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
-       if (!ck)
-               return -ENOENT;
-
-       enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
-       int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
-       if (ret)
-               return ret;
-
-       if (ck->key.btree_id != path->btree_id ||
-           !bpos_eq(ck->key.pos, path->pos)) {
-               six_unlock_type(&ck->c.lock, lock_want);
-               goto retry;
-       }
-
-       if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
-               set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
-       btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
-       path->uptodate = BTREE_ITER_UPTODATE;
-       return 0;
-}
-
-int bch2_btree_path_traverse_cached(struct btree_trans *trans,
-                                   btree_path_idx_t path_idx,
-                                   unsigned flags)
-{
-       EBUG_ON(trans->paths[path_idx].level);
-
-       int ret;
-       do {
-               ret = btree_path_traverse_cached_fast(trans, path_idx);
-               if (unlikely(ret == -ENOENT))
-                       ret = btree_key_cache_fill(trans, path_idx, flags);
-       } while (ret == -EEXIST);
-
-       struct btree_path *path = trans->paths + path_idx;
-
-       if (unlikely(ret)) {
-               path->uptodate = BTREE_ITER_NEED_TRAVERSE;
-               if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                       btree_node_unlock(trans, path, 0);
-                       path->l[0].b = ERR_PTR(ret);
-               }
-       } else {
-               BUG_ON(path->uptodate);
-               BUG_ON(!path->nodes_locked);
-       }
-
-       return ret;
-}
-
-static int btree_key_cache_flush_pos(struct btree_trans *trans,
-                                    struct bkey_cached_key key,
-                                    u64 journal_seq,
-                                    unsigned commit_flags,
-                                    bool evict)
-{
-       struct bch_fs *c = trans->c;
-       struct journal *j = &c->journal;
-       struct btree_iter c_iter, b_iter;
-       struct bkey_cached *ck = NULL;
-       int ret;
-
-       bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
-                            BTREE_ITER_slots|
-                            BTREE_ITER_intent|
-                            BTREE_ITER_all_snapshots);
-       bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
-                            BTREE_ITER_cached|
-                            BTREE_ITER_intent);
-       b_iter.flags &= ~BTREE_ITER_with_key_cache;
-
-       ret = bch2_btree_iter_traverse(trans, &c_iter);
-       if (ret)
-               goto out;
-
-       ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
-       if (!ck)
-               goto out;
-
-       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               if (evict)
-                       goto evict;
-               goto out;
-       }
-
-       if (journal_seq && ck->journal.seq != journal_seq)
-               goto out;
-
-       trans->journal_res.seq = ck->journal.seq;
-
-       /*
-        * If we're at the end of the journal, we really want to free up space
-        * in the journal right away - we don't want to pin that old journal
-        * sequence number with a new btree node write, we want to re-journal
-        * the update
-        */
-       if (ck->journal.seq == journal_last_seq(j))
-               commit_flags |= BCH_WATERMARK_reclaim;
-
-       if (ck->journal.seq != journal_last_seq(j) ||
-           !test_bit(JOURNAL_space_low, &c->journal.flags))
-               commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
-
-       struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
-       ret = bkey_err(btree_k);
-       if (ret)
-               goto err;
-
-       /* Check that we're not violating cache coherency rules: */
-       BUG_ON(bkey_deleted(btree_k.k));
-
-       ret   = bch2_trans_update(trans, &b_iter, ck->k,
-                                 BTREE_UPDATE_key_cache_reclaim|
-                                 BTREE_UPDATE_internal_snapshot_node|
-                                 BTREE_TRIGGER_norun) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 BCH_TRANS_COMMIT_no_check_rw|
-                                 BCH_TRANS_COMMIT_no_enospc|
-                                 commit_flags);
-err:
-       bch2_fs_fatal_err_on(ret &&
-                            !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-                            !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
-                            !bch2_journal_error(j), c,
-                            "flushing key cache: %s", bch2_err_str(ret));
-       if (ret)
-               goto out;
-
-       bch2_journal_pin_drop(j, &ck->journal);
-
-       struct btree_path *path = btree_iter_path(trans, &c_iter);
-       BUG_ON(!btree_node_locked(path, 0));
-
-       if (!evict) {
-               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
-               }
-       } else {
-               struct btree_path *path2;
-               unsigned i;
-evict:
-               trans_for_each_path(trans, path2, i)
-                       if (path2 != path)
-                               __bch2_btree_path_unlock(trans, path2);
-
-               bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
-               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
-               }
-
-               mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
-               if (bkey_cached_evict(&c->btree_key_cache, ck)) {
-                       bkey_cached_free(trans, &c->btree_key_cache, ck);
-               } else {
-                       six_unlock_write(&ck->c.lock);
-                       six_unlock_intent(&ck->c.lock);
-               }
-       }
-out:
-       bch2_trans_iter_exit(trans, &b_iter);
-       bch2_trans_iter_exit(trans, &c_iter);
-       return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
-                               struct journal_entry_pin *pin, u64 seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bkey_cached *ck =
-               container_of(pin, struct bkey_cached, journal);
-       struct bkey_cached_key key;
-       struct btree_trans *trans = bch2_trans_get(c);
-       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-       int ret = 0;
-
-       btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
-       key = ck->key;
-
-       if (ck->journal.seq != seq ||
-           !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               six_unlock_read(&ck->c.lock);
-               goto unlock;
-       }
-
-       if (ck->seq != seq) {
-               bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
-                                       bch2_btree_key_cache_journal_flush);
-               six_unlock_read(&ck->c.lock);
-               goto unlock;
-       }
-       six_unlock_read(&ck->c.lock);
-
-       ret = lockrestart_do(trans,
-               btree_key_cache_flush_pos(trans, key, seq,
-                               BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
-       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
-       bch2_trans_put(trans);
-       return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
-                                 unsigned flags,
-                                 struct btree_insert_entry *insert_entry)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
-       struct bkey_i *insert = insert_entry->k;
-       bool kick_reclaim = false;
-
-       BUG_ON(insert->k.u64s > ck->u64s);
-
-       bkey_copy(ck->k, insert);
-
-       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-               set_bit(BKEY_CACHED_DIRTY, &ck->flags);
-               atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
-               if (bch2_nr_btree_keys_need_flush(c))
-                       kick_reclaim = true;
-       }
-
-       /*
-        * To minimize lock contention, we only add the journal pin here and
-        * defer pin updates to the flush callback via ->seq. Be careful not to
-        * update ->seq on nojournal commits because we don't want to update the
-        * pin to a seq that doesn't include journal updates on disk. Otherwise
-        * we risk losing the update after a crash.
-        *
-        * The only exception is if the pin is not active in the first place. We
-        * have to add the pin because journal reclaim drives key cache
-        * flushing. The flush callback will not proceed unless ->seq matches
-        * the latest pin, so make sure it starts with a consistent value.
-        */
-       if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
-           !journal_pin_active(&ck->journal)) {
-               ck->seq = trans->journal_res.seq;
-       }
-       bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-                            &ck->journal, bch2_btree_key_cache_journal_flush);
-
-       if (kick_reclaim)
-               journal_reclaim_kick(&c->journal);
-       return true;
-}
-
-void bch2_btree_key_cache_drop(struct btree_trans *trans,
-                              struct btree_path *path)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_key_cache *bc = &c->btree_key_cache;
-       struct bkey_cached *ck = (void *) path->l[0].b;
-
-       /*
-        * We just did an update to the btree, bypassing the key cache: the key
-        * cache key is now stale and must be dropped, even if dirty:
-        */
-       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
-               atomic_long_dec(&c->btree_key_cache.nr_dirty);
-               bch2_journal_pin_drop(&c->journal, &ck->journal);
-       }
-
-       bkey_cached_evict(bc, ck);
-       bkey_cached_free(trans, bc, ck);
-
-       mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
-
-       struct btree_path *path2;
-       unsigned i;
-       trans_for_each_path(trans, path2, i)
-               if (path2->l[0].b == (void *) ck) {
-                       /*
-                        * It's safe to clear should_be_locked here because
-                        * we're evicting from the key cache, and we still have
-                        * the underlying btree locked: filling into the key
-                        * cache would require taking a write lock on the btree
-                        * node
-                        */
-                       path2->should_be_locked = false;
-                       __bch2_btree_path_unlock(trans, path2);
-                       path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
-                       btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE);
-               }
-
-       bch2_trans_verify_locks(trans);
-}
-
-static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
-                                          struct shrink_control *sc)
-{
-       struct bch_fs *c = shrink->private_data;
-       struct btree_key_cache *bc = &c->btree_key_cache;
-       struct bucket_table *tbl;
-       struct bkey_cached *ck;
-       size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
-       unsigned iter, start;
-       int srcu_idx;
-
-       srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-       rcu_read_lock();
-
-       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-
-       /*
-        * Scanning is expensive while a rehash is in progress - most elements
-        * will be on the new hashtable, if it's in progress
-        *
-        * A rehash could still start while we're scanning - that's ok, we'll
-        * still see most elements.
-        */
-       if (unlikely(tbl->nest)) {
-               rcu_read_unlock();
-               srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-               return SHRINK_STOP;
-       }
-
-       iter = bc->shrink_iter;
-       if (iter >= tbl->size)
-               iter = 0;
-       start = iter;
-
-       do {
-               struct rhash_head *pos, *next;
-
-               pos = rht_ptr_rcu(&tbl->buckets[iter]);
-
-               while (!rht_is_a_nulls(pos)) {
-                       next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
-                       ck = container_of(pos, struct bkey_cached, hash);
-
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                               bc->skipped_dirty++;
-                       } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
-                               clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-                               bc->skipped_accessed++;
-                       } else if (!bkey_cached_lock_for_evict(ck)) {
-                               bc->skipped_lock_fail++;
-                       } else if (bkey_cached_evict(bc, ck)) {
-                               bkey_cached_free_noassert(bc, ck);
-                               bc->freed++;
-                               freed++;
-                       } else {
-                               six_unlock_write(&ck->c.lock);
-                               six_unlock_intent(&ck->c.lock);
-                       }
-
-                       scanned++;
-                       if (scanned >= nr)
-                               goto out;
-
-                       pos = next;
-               }
-
-               iter++;
-               if (iter >= tbl->size)
-                       iter = 0;
-       } while (scanned < nr && iter != start);
-out:
-       bc->shrink_iter = iter;
-
-       rcu_read_unlock();
-       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
-       return freed;
-}
-
-static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
-                                           struct shrink_control *sc)
-{
-       struct bch_fs *c = shrink->private_data;
-       struct btree_key_cache *bc = &c->btree_key_cache;
-       long nr = atomic_long_read(&bc->nr_keys) -
-               atomic_long_read(&bc->nr_dirty);
-
-       /*
-        * Avoid hammering our shrinker too much if it's nearly empty - the
-        * shrinker code doesn't take into account how big our cache is, if it's
-        * mostly empty but the system is under memory pressure it causes nasty
-        * lock contention:
-        */
-       nr -= 128;
-
-       return max(0L, nr);
-}
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
-{
-       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-       struct bucket_table *tbl;
-       struct bkey_cached *ck;
-       struct rhash_head *pos;
-       LIST_HEAD(items);
-       unsigned i;
-
-       shrinker_free(bc->shrink);
-
-       /*
-        * The loop is needed to guard against racing with rehash:
-        */
-       while (atomic_long_read(&bc->nr_keys)) {
-               rcu_read_lock();
-               tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-               if (tbl) {
-                       if (tbl->nest) {
-                               /* wait for in progress rehash */
-                               rcu_read_unlock();
-                               mutex_lock(&bc->table.mutex);
-                               mutex_unlock(&bc->table.mutex);
-                               continue;
-                       }
-                       for (i = 0; i < tbl->size; i++)
-                               while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
-                                       ck = container_of(pos, struct bkey_cached, hash);
-                                       BUG_ON(!bkey_cached_evict(bc, ck));
-                                       kfree(ck->k);
-                                       kmem_cache_free(bch2_key_cache, ck);
-                               }
-               }
-               rcu_read_unlock();
-       }
-
-       if (atomic_long_read(&bc->nr_dirty) &&
-           !bch2_journal_error(&c->journal) &&
-           test_bit(BCH_FS_was_rw, &c->flags))
-               panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
-                     atomic_long_read(&bc->nr_dirty));
-
-       if (atomic_long_read(&bc->nr_keys))
-               panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
-                     atomic_long_read(&bc->nr_keys));
-
-       if (bc->table_init_done)
-               rhashtable_destroy(&bc->table);
-
-       rcu_pending_exit(&bc->pending[0]);
-       rcu_pending_exit(&bc->pending[1]);
-
-       free_percpu(bc->nr_pending);
-}
-
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
-{
-}
-
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
-{
-       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-       struct shrinker *shrink;
-
-       bc->nr_pending = alloc_percpu(size_t);
-       if (!bc->nr_pending)
-               return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
-       if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
-           rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
-               return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
-       if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
-               return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
-       bc->table_init_done = true;
-
-       shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
-       if (!shrink)
-               return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-       bc->shrink = shrink;
-       shrink->count_objects   = bch2_btree_key_cache_count;
-       shrink->scan_objects    = bch2_btree_key_cache_scan;
-       shrink->batch           = 1 << 14;
-       shrink->seeks           = 0;
-       shrink->private_data    = c;
-       shrinker_register(shrink);
-       return 0;
-}
-
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
-{
-       printbuf_tabstop_push(out, 24);
-       printbuf_tabstop_push(out, 12);
-
-       prt_printf(out, "keys:\t%lu\r\n",               atomic_long_read(&bc->nr_keys));
-       prt_printf(out, "dirty:\t%lu\r\n",              atomic_long_read(&bc->nr_dirty));
-       prt_printf(out, "table size:\t%u\r\n",          bc->table.tbl->size);
-       prt_newline(out);
-       prt_printf(out, "shrinker:\n");
-       prt_printf(out, "requested_to_free:\t%lu\r\n",  bc->requested_to_free);
-       prt_printf(out, "freed:\t%lu\r\n",              bc->freed);
-       prt_printf(out, "skipped_dirty:\t%lu\r\n",      bc->skipped_dirty);
-       prt_printf(out, "skipped_accessed:\t%lu\r\n",   bc->skipped_accessed);
-       prt_printf(out, "skipped_lock_fail:\t%lu\r\n",  bc->skipped_lock_fail);
-       prt_newline(out);
-       prt_printf(out, "pending:\t%zu\r\n",            per_cpu_sum(bc->nr_pending));
-}
-
-void bch2_btree_key_cache_exit(void)
-{
-       kmem_cache_destroy(bch2_key_cache);
-}
-
-int __init bch2_btree_key_cache_init(void)
-{
-       bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
-       if (!bch2_key_cache)
-               return -ENOMEM;
-
-       return 0;
-}
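
bch2_btree_path_traverse_cached() above encodes a small lookup-or-create
protocol: the fast path returns -ENOENT on a cache miss, the fill path
returns -EEXIST when it loses the insert race to another thread, and the
outer loop then retries the fast path, which will find the winner's entry.
A toy model of just that control flow (not bcachefs code):

    #include <errno.h>
    #include <stdio.h>

    static int cached;              /* 1 once an entry exists */
    static int lose_race_once = 1;  /* pretend another thread inserts first */

    static int fast_lookup(void)
    {
            return cached ? 0 : -ENOENT;
    }

    static int slow_create(void)
    {
            if (lose_race_once) {
                    lose_race_once = 0;
                    cached = 1;     /* the "other thread's" entry */
                    return -EEXIST;
            }
            cached = 1;
            return 0;
    }

    static int traverse_cached(void)
    {
            int ret;

            do {
                    ret = fast_lookup();
                    if (ret == -ENOENT)
                            ret = slow_create();
            } while (ret == -EEXIST);

            return ret;
    }

    int main(void)
    {
            printf("ret = %d\n", traverse_cached()); /* 0, after one retry */
            return 0;
    }
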
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
deleted file mode 100644 (file)
index 82d8c72..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
-#define _BCACHEFS_BTREE_KEY_CACHE_H
-
-static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
-{
-       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
-       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
-       size_t max_dirty = 1024 + nr_keys / 2;
-
-       return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
-static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
-       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
-       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
-       size_t max_dirty = 4096 + (nr_keys * 3) / 4;
-
-       return nr_dirty - max_dirty;
-}
-
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
-       return __bch2_btree_key_cache_must_wait(c) > 0;
-}
-
-static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
-{
-       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
-       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
-       size_t max_dirty = 2048 + (nr_keys * 5) / 8;
-
-       return nr_dirty <= max_dirty;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *,
-                               struct journal_entry_pin *, u64);
-
-struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-
-int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
-
-bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
-                       struct btree_insert_entry *);
-void bch2_btree_key_cache_drop(struct btree_trans *,
-                              struct btree_path *);
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
-
-void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
-
-void bch2_btree_key_cache_exit(void);
-int __init bch2_btree_key_cache_init(void);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
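
The three inline helpers above form a hysteresis band around the dirty-key
count. With nr_keys = 100000 (an illustrative figure, not from the
source), the thresholds work out to:

    need_flush when nr_dirty >  1024 + nr_keys/2     = 51024
    must_wait  when nr_dirty >  4096 + (nr_keys*3)/4 = 79096
    wait_done  when nr_dirty <= 2048 + (nr_keys*5)/8 = 64548

Journal reclaim is kicked well before writers are throttled, and once
throttled they stay blocked until nr_dirty falls all the way below the
lower wait_done threshold, so flushing a single key can't bounce writers
in and out of the wait state.
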
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
deleted file mode 100644 (file)
index 722f1ed..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-
-#include "rcu_pending.h"
-
-struct btree_key_cache {
-       struct rhashtable       table;
-       bool                    table_init_done;
-
-       struct shrinker         *shrink;
-       unsigned                shrink_iter;
-
-       /* 0: non pcpu reader locks, 1: pcpu reader locks */
-       struct rcu_pending      pending[2];
-       size_t __percpu         *nr_pending;
-
-       atomic_long_t           nr_keys;
-       atomic_long_t           nr_dirty;
-
-       /* shrinker stats */
-       unsigned long           requested_to_free;
-       unsigned long           freed;
-       unsigned long           skipped_dirty;
-       unsigned long           skipped_accessed;
-       unsigned long           skipped_lock_fail;
-};
-
-struct bkey_cached_key {
-       u32                     btree_id;
-       struct bpos             pos;
-} __packed __aligned(4);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
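
A note on the __packed __aligned(4) on bkey_cached_key: the rhashtable
parameters in btree_key_cache.c set .key_len to the struct's size with no
custom hash function, so the hash is computed over the raw key bytes.
Packing guarantees there are no padding bytes whose contents could make
otherwise-equal keys hash differently. A sketch of that invariant (field
sizes assumed for illustration, not taken from the bcachefs headers):

    #include <assert.h>
    #include <stdint.h>

    struct pos {
            uint64_t inode, offset;
            uint32_t snapshot;
    } __attribute__((packed, aligned(4)));

    struct cached_key {
            uint32_t btree_id;
            struct pos pos;
    } __attribute__((packed, aligned(4)));

    /* no padding anywhere, so hashing the raw bytes is well defined */
    static_assert(sizeof(struct cached_key) == 4 + sizeof(struct pos),
                  "cached_key must have no padding");
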
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
deleted file mode 100644 (file)
index bed2b4b..0000000
+++ /dev/null
@@ -1,936 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_types.h"
-
-static struct lock_class_key bch2_btree_node_lock_key;
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
-                         enum six_lock_init_flags flags,
-                         gfp_t gfp)
-{
-       __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
-       lockdep_set_notrack_class(&b->lock);
-}
-
-/* Btree node locking: */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
-                                                 struct btree_path *skip,
-                                                 struct btree_bkey_cached_common *b,
-                                                 unsigned level)
-{
-       struct btree_path *path;
-       struct six_lock_count ret;
-       unsigned i;
-
-       memset(&ret, 0, sizeof(ret));
-
-       if (IS_ERR_OR_NULL(b))
-               return ret;
-
-       trans_for_each_path(trans, path, i)
-               if (path != skip && &path->l[level].b->c == b) {
-                       int t = btree_node_locked_type(path, level);
-
-                       if (t != BTREE_NODE_UNLOCKED)
-                               ret.n[t]++;
-               }
-
-       return ret;
-}
-
-/* unlock */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
-                       struct btree_path *path, struct btree *b)
-{
-       bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-/* lock */
-
-/*
- * @trans wants to lock @b with type @type
- */
-struct trans_waiting_for_lock {
-       struct btree_trans              *trans;
-       struct btree_bkey_cached_common *node_want;
-       enum six_lock_type              lock_want;
-
-       /* for iterating over held locks: */
-       u8                              path_idx;
-       u8                              level;
-       u64                             lock_start_time;
-};
-
-struct lock_graph {
-       struct trans_waiting_for_lock   g[8];
-       unsigned                        nr;
-};
-
-static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
-{
-       struct trans_waiting_for_lock *i;
-
-       prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
-
-       for (i = g->g; i < g->g + g->nr; i++) {
-               struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
-               if (!task)
-                       continue;
-
-               bch2_btree_trans_to_text(out, i->trans);
-               bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
-       }
-}
-
-static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
-{
-       struct trans_waiting_for_lock *i;
-
-       for (i = g->g; i != g->g + g->nr; i++) {
-               struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
-               if (i != g->g)
-                       prt_str(out, "<- ");
-               prt_printf(out, "%u ", task ? task->pid : 0);
-       }
-       prt_newline(out);
-}
-
-static void lock_graph_up(struct lock_graph *g)
-{
-       closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static noinline void lock_graph_pop_all(struct lock_graph *g)
-{
-       while (g->nr)
-               lock_graph_up(g);
-}
-
-static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
-       while (g->g + g->nr > i)
-               lock_graph_up(g);
-}
-
-static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
-       g->g[g->nr++] = (struct trans_waiting_for_lock) {
-               .trans          = trans,
-               .node_want      = trans->locking,
-               .lock_want      = trans->locking_wait.lock_want,
-       };
-}
-
-static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
-       closure_get(&trans->ref);
-       __lock_graph_down(g, trans);
-}
-
-static bool lock_graph_remove_non_waiters(struct lock_graph *g,
-                                         struct trans_waiting_for_lock *from)
-{
-       struct trans_waiting_for_lock *i;
-
-       if (from->trans->locking != from->node_want) {
-               lock_graph_pop_from(g, from);
-               return true;
-       }
-
-       for (i = from + 1; i < g->g + g->nr; i++)
-               if (i->trans->locking != i->node_want ||
-                   i->trans->locking_wait.start_time != i[-1].lock_start_time) {
-                       lock_graph_pop_from(g, i);
-                       return true;
-               }
-
-       return false;
-}
-
-static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-
-       count_event(c, trans_restart_would_deadlock);
-
-       if (trace_trans_restart_would_deadlock_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               buf.atomic++;
-               print_cycle(&buf, g);
-
-               trace_trans_restart_would_deadlock(trans, buf.buf);
-               printbuf_exit(&buf);
-       }
-}
-
-static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
-       if (i == g->g) {
-               trace_would_deadlock(g, i->trans);
-               return btree_trans_restart_foreign_task(i->trans,
-                                       BCH_ERR_transaction_restart_would_deadlock,
-                                       _THIS_IP_);
-       } else {
-               i->trans->lock_must_abort = true;
-               wake_up_process(i->trans->locking_wait.task);
-               return 0;
-       }
-}
-
-static int btree_trans_abort_preference(struct btree_trans *trans)
-{
-       if (trans->lock_may_not_fail)
-               return 0;
-       if (trans->locking_wait.lock_want == SIX_LOCK_write)
-               return 1;
-       if (!trans->in_traverse_all)
-               return 2;
-       return 3;
-}
-
-static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
-{
-       struct printbuf buf = PRINTBUF;
-       buf.atomic++;
-
-       prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
-       for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
-               struct btree_trans *trans = i->trans;
-
-               bch2_btree_trans_to_text(&buf, trans);
-
-               prt_printf(&buf, "backtrace:\n");
-               printbuf_indent_add(&buf, 2);
-               bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
-               printbuf_indent_sub(&buf, 2);
-               prt_newline(&buf);
-       }
-
-       bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-       BUG();
-}
-
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
-                               struct trans_waiting_for_lock *from)
-{
-       struct trans_waiting_for_lock *i, *abort = NULL;
-       unsigned best = 0, pref;
-       int ret;
-
-       if (lock_graph_remove_non_waiters(g, from))
-               return 0;
-
-       /* Only checking, for debugfs: */
-       if (cycle) {
-               print_cycle(cycle, g);
-               ret = -1;
-               goto out;
-       }
-
-       for (i = from; i < g->g + g->nr; i++) {
-               pref = btree_trans_abort_preference(i->trans);
-               if (pref > best) {
-                       abort = i;
-                       best = pref;
-               }
-       }
-
-       if (unlikely(!best))
-               break_cycle_fail(g);
-
-       ret = abort_lock(g, abort);
-out:
-       if (ret)
-               lock_graph_pop_all(g);
-       else
-               lock_graph_pop_from(g, abort);
-       return ret;
-}
-
-static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
-                             struct printbuf *cycle)
-{
-       struct btree_trans *orig_trans = g->g->trans;
-
-       for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
-               if (i->trans == trans) {
-                       closure_put(&trans->ref);
-                       return break_cycle(g, cycle, i);
-               }
-
-       if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
-               closure_put(&trans->ref);
-
-               if (orig_trans->lock_may_not_fail)
-                       return 0;
-
-               lock_graph_pop_all(g);
-
-               if (cycle)
-                       return 0;
-
-               trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
-               return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
-       }
-
-       __lock_graph_down(g, trans);
-       return 0;
-}
-
-static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
-{
-       return t1 + t2 > 1;
-}
-
-int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
-{
-       struct lock_graph g;
-       struct trans_waiting_for_lock *top;
-       struct btree_bkey_cached_common *b;
-       btree_path_idx_t path_idx;
-       int ret = 0;
-
-       g.nr = 0;
-
-       if (trans->lock_must_abort && !trans->lock_may_not_fail) {
-               if (cycle)
-                       return -1;
-
-               trace_would_deadlock(&g, trans);
-               return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
-       }
-
-       lock_graph_down(&g, trans);
-
-       /* trans->paths is rcu protected vs. freeing */
-       guard(rcu)();
-       if (cycle)
-               cycle->atomic++;
-next:
-       if (!g.nr)
-               goto out;
-
-       top = &g.g[g.nr - 1];
-
-       struct btree_path *paths = rcu_dereference(top->trans->paths);
-       if (!paths)
-               goto up;
-
-       unsigned long *paths_allocated = trans_paths_allocated(paths);
-
-       trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
-                                    path_idx, top->path_idx) {
-               struct btree_path *path = paths + path_idx;
-               if (!path->nodes_locked)
-                       continue;
-
-               if (path_idx != top->path_idx) {
-                       top->path_idx           = path_idx;
-                       top->level              = 0;
-                       top->lock_start_time    = 0;
-               }
-
-               for (;
-                    top->level < BTREE_MAX_DEPTH;
-                    top->level++, top->lock_start_time = 0) {
-                       int lock_held = btree_node_locked_type(path, top->level);
-
-                       if (lock_held == BTREE_NODE_UNLOCKED)
-                               continue;
-
-                       b = &READ_ONCE(path->l[top->level].b)->c;
-
-                       if (IS_ERR_OR_NULL(b)) {
-                               /*
-                                * If we get here, it means we raced with the
-                                * other thread updating its btree_path
-                                * structures - which means it can't be blocked
-                                * waiting on a lock:
-                                */
-                               if (!lock_graph_remove_non_waiters(&g, g.g)) {
-                                       /*
-                                        * If lock_graph_remove_non_waiters()
-                                        * didn't do anything, it must be
-                                        * because we're being called by debugfs
-                                        * checking for lock cycles, which
-                                        * invokes us on btree_transactions that
-                                        * aren't actually waiting on anything.
-                                        * Just bail out:
-                                        */
-                                       lock_graph_pop_all(&g);
-                               }
-
-                               goto next;
-                       }
-
-                       if (list_empty_careful(&b->lock.wait_list))
-                               continue;
-
-                       raw_spin_lock(&b->lock.wait_lock);
-                       list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
-                               BUG_ON(b != trans->locking);
-
-                               if (top->lock_start_time &&
-                                   time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
-                                       continue;
-
-                               top->lock_start_time = trans->locking_wait.start_time;
-
-                               /* Don't check for self deadlock: */
-                               if (trans == top->trans ||
-                                   !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
-                                       continue;
-
-                               closure_get(&trans->ref);
-                               raw_spin_unlock(&b->lock.wait_lock);
-
-                               ret = lock_graph_descend(&g, trans, cycle);
-                               if (ret)
-                                       goto out;
-                               goto next;
-                       }
-                       raw_spin_unlock(&b->lock.wait_lock);
-               }
-       }
-up:
-       if (g.nr > 1 && cycle)
-               print_chain(cycle, &g);
-       lock_graph_up(&g);
-       goto next;
-out:
-       if (cycle)
-               --cycle->atomic;
-       return ret;
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
-{
-       struct btree_trans *trans = p;
-
-       return bch2_check_for_deadlock(trans, NULL);
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
-                                struct btree_bkey_cached_common *b,
-                                bool lock_may_not_fail)
-{
-       int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
-       int ret;
-
-       /*
-        * Must drop our read locks before calling six_lock_write() -
-        * six_unlock() won't do wakeups until the reader count
-        * goes to 0, and it's safe because we have the node intent
-        * locked:
-        */
-       six_lock_readers_add(&b->lock, -readers);
-       ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
-                                      lock_may_not_fail, _RET_IP_);
-       six_lock_readers_add(&b->lock, readers);
-
-       if (ret)
-               mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
-
-       return ret;
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      struct btree_bkey_cached_common *b)
-{
-       int ret = __btree_node_lock_write(trans, path, b, true);
-       BUG_ON(ret);
-}
-
-/* relock */
-
-static int btree_path_get_locks(struct btree_trans *trans,
-                               struct btree_path *path,
-                               bool upgrade,
-                               struct get_locks_fail *f,
-                               int restart_err)
-{
-       unsigned l = path->level;
-
-       do {
-               if (!btree_path_node(path, l))
-                       break;
-
-               if (!(upgrade
-                     ? bch2_btree_node_upgrade(trans, path, l)
-                     : bch2_btree_node_relock(trans, path, l)))
-                       goto err;
-
-               l++;
-       } while (l < path->locks_want);
-
-       if (path->uptodate == BTREE_ITER_NEED_RELOCK)
-               path->uptodate = BTREE_ITER_UPTODATE;
-
-       return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
-err:
-       if (f) {
-               f->l    = l;
-               f->b    = path->l[l].b;
-       }
-
-       /*
-        * Do transaction restart before unlocking, so we don't pop
-        * should_be_locked asserts
-        */
-       if (restart_err) {
-               btree_trans_restart(trans, restart_err);
-       } else if (path->should_be_locked && !trans->restarted) {
-               if (upgrade)
-                       path->locks_want = l;
-               return -1;
-       }
-
-       __bch2_btree_path_unlock(trans, path);
-       btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-
-       /*
-        * When we fail to get a lock, we have to ensure that any child nodes
-        * can't be relocked so bch2_btree_path_traverse has to walk back up to
-        * the node that we failed to relock:
-        */
-       do {
-               path->l[l].b = upgrade
-                       ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
-                       : ERR_PTR(-BCH_ERR_no_btree_node_relock);
-       } while (l--);
-
-       return -restart_err ?: -1;
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
-                             struct btree_path *path, unsigned level,
-                             bool trace)
-{
-       struct btree *b = btree_path_node(path, level);
-       int want = __btree_lock_want(path, level);
-
-       if (race_fault())
-               goto fail;
-
-       if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
-           (btree_node_lock_seq_matches(path, b, level) &&
-            btree_node_lock_increment(trans, &b->c, level, want))) {
-               mark_btree_node_locked(trans, path, level, want);
-               return true;
-       }
-fail:
-       if (trace && !trans->notrace_relock_fail)
-               trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
-       return false;
-}
-
-/* upgrade */
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
-                            struct btree_path *path, unsigned level)
-{
-       struct btree *b = path->l[level].b;
-
-       if (!is_btree_node(path, level))
-               return false;
-
-       switch (btree_lock_want(path, level)) {
-       case BTREE_NODE_UNLOCKED:
-               BUG_ON(btree_node_locked(path, level));
-               return true;
-       case BTREE_NODE_READ_LOCKED:
-               BUG_ON(btree_node_intent_locked(path, level));
-               return bch2_btree_node_relock(trans, path, level);
-       case BTREE_NODE_INTENT_LOCKED:
-               break;
-       case BTREE_NODE_WRITE_LOCKED:
-               BUG();
-       }
-
-       if (btree_node_intent_locked(path, level))
-               return true;
-
-       if (race_fault())
-               return false;
-
-       if (btree_node_locked(path, level)
-           ? six_lock_tryupgrade(&b->c.lock)
-           : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
-               goto success;
-
-       if (btree_node_lock_seq_matches(path, b, level) &&
-           btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
-               btree_node_unlock(trans, path, level);
-               goto success;
-       }
-
-       trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
-       return false;
-success:
-       mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
-       return true;
-}
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
-                                 struct btree_path *path)
-{
-       unsigned l;
-
-       for (l = path->level;
-            l < path->locks_want && btree_path_node(path, l);
-            l++) {
-               if (!bch2_btree_node_relock(trans, path, l)) {
-                       __bch2_btree_path_unlock(trans, path);
-                       btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-                       trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
-                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
-               }
-       }
-
-       return 0;
-}
-
-__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
-{
-       bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
-       bch2_trans_verify_locks(trans);
-       return ret;
-}
-
-int __bch2_btree_path_relock(struct btree_trans *trans,
-                       struct btree_path *path, unsigned long trace_ip)
-{
-       if (!bch2_btree_path_relock_norestart(trans, path)) {
-               trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
-               return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
-       }
-
-       return 0;
-}
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
-                                        struct btree_path *path,
-                                        unsigned new_locks_want)
-{
-       path->locks_want = new_locks_want;
-
-       /*
-        * If we need it locked, we can't touch it. Otherwise, we can return
-        * success - bch2_path_get() will use this path, and it'll just be
-        * retraversed:
-        */
-       bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
-               !path->should_be_locked;
-
-       bch2_btree_path_verify_locks(trans, path);
-       return ret;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *trans,
-                             struct btree_path *path,
-                             unsigned new_locks_want)
-{
-       unsigned old_locks = path->nodes_locked;
-       unsigned old_locks_want = path->locks_want;
-
-       path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
-
-       struct get_locks_fail f = {};
-       int ret = btree_path_get_locks(trans, path, true, &f,
-                               BCH_ERR_transaction_restart_upgrade);
-       if (!ret)
-               goto out;
-
-       /*
-        * XXX: this is ugly - we'd prefer to not be mucking with other
-        * iterators in the btree_trans here.
-        *
-        * On failure to upgrade the iterator, setting iter->locks_want and
-        * calling get_locks() is sufficient to make bch2_btree_path_traverse()
-        * get the locks we want on transaction restart.
-        *
-        * But if this iterator was a clone, on transaction restart what we did
-        * to this iterator isn't going to be preserved.
-        *
-        * Possibly we could add an iterator field for the parent iterator when
-        * an iterator is a copy - for now, we'll just upgrade any other
-        * iterators with the same btree id.
-        *
-        * The code below used to be needed to ensure ancestor nodes get locked
-        * before interior nodes - now that's handled by
-        * bch2_btree_path_traverse_all().
-        */
-       if (!path->cached && !trans->in_traverse_all) {
-               struct btree_path *linked;
-               unsigned i;
-
-               trans_for_each_path(trans, linked, i)
-                       if (linked != path &&
-                           linked->cached == path->cached &&
-                           linked->btree_id == path->btree_id &&
-                           linked->locks_want < new_locks_want) {
-                               linked->locks_want = new_locks_want;
-                               btree_path_get_locks(trans, linked, true, NULL, 0);
-                       }
-       }
-
-       count_event(trans->c, trans_restart_upgrade);
-       if (trace_trans_restart_upgrade_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
-               prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
-               bch2_bpos_to_text(&buf, path->pos);
-               prt_printf(&buf, "locks want %u -> %u level %u\n",
-                          old_locks_want, new_locks_want, f.l);
-               prt_printf(&buf, "nodes_locked %x -> %x\n",
-                          old_locks, path->nodes_locked);
-               prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
-                          !f.b ? "(null)" : "(node)");
-               prt_printf(&buf, "path seq %u node seq %u\n",
-                          IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
-                          path->l[f.l].lock_seq);
-
-               trace_trans_restart_upgrade(trans->c, buf.buf);
-               printbuf_exit(&buf);
-       }
-out:
-       bch2_trans_verify_locks(trans);
-       return ret;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
-                                struct btree_path *path,
-                                unsigned new_locks_want)
-{
-       unsigned l, old_locks_want = path->locks_want;
-
-       if (trans->restarted)
-               return;
-
-       EBUG_ON(path->locks_want < new_locks_want);
-
-       path->locks_want = new_locks_want;
-
-       while (path->nodes_locked &&
-              (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
-               if (l > path->level) {
-                       btree_node_unlock(trans, path, l);
-               } else {
-                       if (btree_node_intent_locked(path, l)) {
-                               six_lock_downgrade(&path->l[l].b->c.lock);
-                               mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
-                       }
-                       break;
-               }
-       }
-
-       bch2_btree_path_verify_locks(trans, path);
-
-       trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
-}
-
-/* Btree transaction locking: */
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       if (trans->restarted)
-               return;
-
-       trans_for_each_path(trans, path, i)
-               if (path->ref)
-                       bch2_btree_path_downgrade(trans, path);
-}
-
-static inline void __bch2_trans_unlock(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               __bch2_btree_path_unlock(trans, path);
-}
-
-static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
-                                                  struct get_locks_fail *f, bool trace, ulong ip)
-{
-       if (!trace)
-               goto out;
-
-       if (trace_trans_restart_relock_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bpos_to_text(&buf, path->pos);
-               prt_printf(&buf, " %s l=%u seq=%u node seq=",
-                          bch2_btree_id_str(path->btree_id),
-                          f->l, path->l[f->l].lock_seq);
-               if (IS_ERR_OR_NULL(f->b)) {
-                       prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
-               } else {
-                       prt_printf(&buf, "%u", f->b->c.lock.seq);
-
-                       struct six_lock_count c =
-                               bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
-                       prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
-                       c = six_lock_counts(&f->b->c.lock);
-                       prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-               }
-
-               trace_trans_restart_relock(trans, ip, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       count_event(trans->c, trans_restart_relock);
-out:
-       __bch2_trans_unlock(trans);
-       bch2_trans_verify_locks(trans);
-}
-
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
-{
-       bch2_trans_verify_locks(trans);
-
-       if (unlikely(trans->restarted))
-               return -((int) trans->restarted);
-       if (unlikely(trans->locked))
-               goto out;
-
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i) {
-               struct get_locks_fail f;
-               int ret;
-
-               if (path->should_be_locked &&
-                   (ret = btree_path_get_locks(trans, path, false, &f,
-                                       BCH_ERR_transaction_restart_relock))) {
-                       bch2_trans_relock_fail(trans, path, &f, trace, ip);
-                       return ret;
-               }
-       }
-
-       trans_set_locked(trans, true);
-out:
-       bch2_trans_verify_locks(trans);
-       return 0;
-}
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
-       return __bch2_trans_relock(trans, true, _RET_IP_);
-}
-
-int bch2_trans_relock_notrace(struct btree_trans *trans)
-{
-       return __bch2_trans_relock(trans, false, _RET_IP_);
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
-       trans_set_unlocked(trans);
-
-       __bch2_trans_unlock(trans);
-}
-
-void bch2_trans_unlock_long(struct btree_trans *trans)
-{
-       bch2_trans_unlock(trans);
-       bch2_trans_srcu_unlock(trans);
-}
-
-void bch2_trans_unlock_write(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
-                       if (btree_node_write_locked(path, l))
-                               bch2_btree_node_unlock_write(trans, path, path->l[l].b);
-}
-
-int __bch2_trans_mutex_lock(struct btree_trans *trans,
-                           struct mutex *lock)
-{
-       int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
-
-       if (ret)
-               mutex_unlock(lock);
-       return ret;
-}
-
-/* Debug */
-
-void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
-{
-       if (!path->nodes_locked && btree_path_node(path, path->level)) {
-               /*
-                * A path may be uptodate and yet have nothing locked if and only if
-                * there is no node at path->level, which generally means we were
-                * iterating over all nodes and got to the end of the btree
-                */
-               BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
-               BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
-       }
-
-       if (!path->nodes_locked)
-               return;
-
-       for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
-               int want = btree_lock_want(path, l);
-               int have = btree_node_locked_type_nowrite(path, l);
-
-               BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
-
-               BUG_ON(is_btree_node(path, l) && want != have);
-
-               BUG_ON(btree_node_locked(path, l) &&
-                      path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
-       }
-}
-
-static bool bch2_trans_locked(struct btree_trans *trans)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               if (path->nodes_locked)
-                       return true;
-       return false;
-}
-
-void __bch2_trans_verify_locks(struct btree_trans *trans)
-{
-       if (!trans->locked) {
-               BUG_ON(bch2_trans_locked(trans));
-               return;
-       }
-
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               __bch2_btree_path_verify_locks(trans, path);
-}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
deleted file mode 100644 (file)
index f2173a3..0000000
+++ /dev/null
@@ -1,466 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_LOCKING_H
-#define _BCACHEFS_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-
-void bch2_trans_unlock_write(struct btree_trans *);
-
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
-       return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
-}
-
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
-       return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
-               ? &trans->c->btree_transaction_stats[trans->fn_idx]
-               : NULL;
-}
-
-/* matches six lock types */
-enum btree_node_locked_type {
-       BTREE_NODE_UNLOCKED             = -1,
-       BTREE_NODE_READ_LOCKED          = SIX_LOCK_read,
-       BTREE_NODE_INTENT_LOCKED        = SIX_LOCK_intent,
-       BTREE_NODE_WRITE_LOCKED         = SIX_LOCK_write,
-};
-
-static inline int btree_node_locked_type(struct btree_path *path,
-                                        unsigned level)
-{
-       return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
-}
-
-static inline int btree_node_locked_type_nowrite(struct btree_path *path,
-                                                unsigned level)
-{
-       int have = btree_node_locked_type(path, level);
-       return have == BTREE_NODE_WRITE_LOCKED
-               ? BTREE_NODE_INTENT_LOCKED
-               : have;
-}
-
-static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
-{
-       return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
-}
-
-static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
-{
-       return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
-{
-       return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
-{
-       return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
-}
-
-static inline void mark_btree_node_locked_noreset(struct btree_path *path,
-                                                 unsigned level,
-                                                 enum btree_node_locked_type type)
-{
-       /* relying on this to avoid a branch */
-       BUILD_BUG_ON(SIX_LOCK_read   != 0);
-       BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
-       path->nodes_locked &= ~(3U << (level << 1));
-       path->nodes_locked |= (type + 1) << (level << 1);
-}
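
The helpers above pack one two-bit slot per btree level into
path->nodes_locked, storing the six lock type plus one so that zero means
"unlocked" - which is exactly what the BUILD_BUG_ONs guarantee. A minimal
userspace sketch of the encoding (illustrative only, not kernel code):

	#include <assert.h>

	enum locked_type { UNLOCKED = -1, READ = 0, INTENT = 1, WRITE = 2 };

	static unsigned set_locked(unsigned nodes_locked, unsigned level,
				   enum locked_type t)
	{
		nodes_locked &= ~(3U << (level << 1));	/* clear the 2-bit slot */
		nodes_locked |= (unsigned) (t + 1) << (level << 1); /* store type + 1 */
		return nodes_locked;
	}

	static enum locked_type get_locked(unsigned nodes_locked, unsigned level)
	{
		return (enum locked_type) (((nodes_locked >> (level << 1)) & 3) - 1);
	}

	int main(void)
	{
		unsigned locks = 0;

		locks = set_locked(locks, 0, READ);	/* bits 0-1 = 1 */
		locks = set_locked(locks, 1, INTENT);	/* bits 2-3 = 2 */
		assert(get_locked(locks, 0) == READ);
		assert(get_locked(locks, 1) == INTENT);
		assert(get_locked(locks, 2) == UNLOCKED);
		return 0;
	}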
-
-static inline void mark_btree_node_locked(struct btree_trans *trans,
-                                         struct btree_path *path,
-                                         unsigned level,
-                                         enum btree_node_locked_type type)
-{
-       mark_btree_node_locked_noreset(path, level, type);
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-       path->l[level].lock_taken_time = local_clock();
-#endif
-}
-
-static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
-{
-       return level < path->locks_want
-               ? SIX_LOCK_intent
-               : SIX_LOCK_read;
-}
-
-static inline enum btree_node_locked_type
-btree_lock_want(struct btree_path *path, int level)
-{
-       if (level < path->level)
-               return BTREE_NODE_UNLOCKED;
-       if (level < path->locks_want)
-               return BTREE_NODE_INTENT_LOCKED;
-       if (level == path->level)
-               return BTREE_NODE_READ_LOCKED;
-       return BTREE_NODE_UNLOCKED;
-}
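
Taken together, __btree_lock_want() and btree_lock_want() encode a simple
ladder: nothing is wanted below the path's level, every level below
locks_want wants an intent lock, and the path's own level wants at least a
read lock. A standalone mirror of that ladder (illustrative only; the
locks_want values are made up):

	#include <assert.h>

	enum want { W_UNLOCKED, W_READ, W_INTENT };

	/* mirrors btree_lock_want() above */
	static enum want lock_want(unsigned path_level, unsigned locks_want,
				   unsigned level)
	{
		if (level < path_level)
			return W_UNLOCKED;
		if (level < locks_want)
			return W_INTENT;
		return level == path_level ? W_READ : W_UNLOCKED;
	}

	int main(void)
	{
		/* read-only path at the leaf: read lock there, nothing above */
		assert(lock_want(0, 0, 0) == W_READ);
		assert(lock_want(0, 0, 1) == W_UNLOCKED);

		/* update path wanting intent locks on levels 0 and 1 */
		assert(lock_want(0, 2, 0) == W_INTENT);
		assert(lock_want(0, 2, 1) == W_INTENT);
		assert(lock_want(0, 2, 2) == W_UNLOCKED);
		return 0;
	}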
-
-static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
-                                             struct btree_path *path, unsigned level)
-{
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-       __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
-                                path->l[level].lock_taken_time,
-                                local_clock());
-#endif
-}
-
-/* unlock: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *,
-                       struct btree_path *, struct btree *);
-
-static inline void btree_node_unlock(struct btree_trans *trans,
-                                    struct btree_path *path, unsigned level)
-{
-       int lock_type = btree_node_locked_type(path, level);
-
-       EBUG_ON(level >= BTREE_MAX_DEPTH);
-
-       if (lock_type != BTREE_NODE_UNLOCKED) {
-               if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
-                       bch2_btree_node_unlock_write(trans, path, path->l[level].b);
-                       lock_type = BTREE_NODE_INTENT_LOCKED;
-               }
-               six_unlock_type(&path->l[level].b->c.lock, lock_type);
-               btree_trans_lock_hold_time_update(trans, path, level);
-               mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
-       }
-}
-
-static inline int btree_path_lowest_level_locked(struct btree_path *path)
-{
-       return __ffs(path->nodes_locked) >> 1;
-}
-
-static inline int btree_path_highest_level_locked(struct btree_path *path)
-{
-       return __fls(path->nodes_locked) >> 1;
-}
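
Because each level owns two bits of nodes_locked, the lowest and highest
locked levels fall out of find-first-set/find-last-set on the mask, halved.
A userspace sketch using GCC builtins in place of the kernel's
__ffs()/__fls() (like the kernel helpers, it assumes nodes_locked != 0):

	#include <assert.h>

	static int lowest_level_locked(unsigned nodes_locked)
	{
		return __builtin_ctz(nodes_locked) >> 1;	/* __ffs() */
	}

	static int highest_level_locked(unsigned nodes_locked)
	{
		return (31 - __builtin_clz(nodes_locked)) >> 1;	/* __fls() */
	}

	int main(void)
	{
		/* read lock at level 0, intent lock at level 3 */
		unsigned locks = 1U | (2U << 6);

		assert(lowest_level_locked(locks) == 0);
		assert(highest_level_locked(locks) == 3);
		return 0;
	}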
-
-static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
-                                           struct btree_path *path)
-{
-       btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
-
-       while (path->nodes_locked)
-               btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
-}
-
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
-{
-       if (!b->c.lock.write_lock_recurse) {
-               struct btree_path *linked;
-               unsigned i;
-
-               trans_for_each_path_with_node(trans, b, linked, i)
-                       linked->l[b->c.level].lock_seq++;
-       }
-
-       six_unlock_write(&b->c.lock);
-}
-
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
-                                    struct btree *b)
-{
-       EBUG_ON(path->l[b->c.level].b != b);
-       EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
-       EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
-
-       mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-       __bch2_btree_node_unlock_write(trans, b);
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
-
-/* lock: */
-
-static inline void trans_set_locked(struct btree_trans *trans, bool try)
-{
-       if (!trans->locked) {
-               lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
-               trans->locked = true;
-               trans->last_unlock_ip = 0;
-
-               trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
-               current->flags |= PF_MEMALLOC_NOFS;
-       }
-}
-
-static inline void trans_set_unlocked(struct btree_trans *trans)
-{
-       if (trans->locked) {
-               lock_release(&trans->dep_map, _THIS_IP_);
-               trans->locked = false;
-               trans->last_unlock_ip = _RET_IP_;
-
-               if (!trans->pf_memalloc_nofs)
-                       current->flags &= ~PF_MEMALLOC_NOFS;
-       }
-}
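
Note that trans_set_locked()/trans_set_unlocked() also open-code a
PF_MEMALLOC_NOFS save/restore pair, so locking and unlocking a transaction
inside an existing NOFS section leaves the flag set. The pattern in
isolation (a sketch with a stand-in flag value, not the kernel
definitions):

	#include <assert.h>
	#include <stdbool.h>

	#define MEMALLOC_NOFS 0x1	/* stand-in for PF_MEMALLOC_NOFS */

	struct task { unsigned flags; };

	/* on "lock": remember whether NOFS was already set, then set it */
	static bool nofs_save(struct task *t)
	{
		bool was_set = t->flags & MEMALLOC_NOFS;

		t->flags |= MEMALLOC_NOFS;
		return was_set;
	}

	/* on "unlock": clear NOFS only if we were the ones who set it */
	static void nofs_restore(struct task *t, bool was_set)
	{
		if (!was_set)
			t->flags &= ~MEMALLOC_NOFS;
	}

	int main(void)
	{
		struct task t = { .flags = MEMALLOC_NOFS };
		bool saved = nofs_save(&t);

		nofs_restore(&t, saved);
		assert(t.flags & MEMALLOC_NOFS);	/* outer state preserved */
		return 0;
	}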
-
-static inline int __btree_node_lock_nopath(struct btree_trans *trans,
-                                        struct btree_bkey_cached_common *b,
-                                        enum six_lock_type type,
-                                        bool lock_may_not_fail,
-                                        unsigned long ip)
-{
-       trans->lock_may_not_fail = lock_may_not_fail;
-       trans->lock_must_abort  = false;
-       trans->locking          = b;
-
-       int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
-                                    bch2_six_check_for_deadlock, trans, ip);
-       WRITE_ONCE(trans->locking, NULL);
-       WRITE_ONCE(trans->locking_wait.start_time, 0);
-
-       if (!ret)
-               trace_btree_path_lock(trans, _THIS_IP_, b);
-       return ret;
-}
-
-static inline int __must_check
-btree_node_lock_nopath(struct btree_trans *trans,
-                      struct btree_bkey_cached_common *b,
-                      enum six_lock_type type,
-                      unsigned long ip)
-{
-       return __btree_node_lock_nopath(trans, b, type, false, ip);
-}
-
-static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
-                                        struct btree_bkey_cached_common *b,
-                                        enum six_lock_type type)
-{
-       int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
-
-       BUG_ON(ret);
-}
-
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_trans *trans,
-                                            struct btree_bkey_cached_common *b,
-                                            unsigned level,
-                                            enum btree_node_locked_type want)
-{
-       struct btree_path *path;
-       unsigned i;
-
-       trans_for_each_path(trans, path, i)
-               if (&path->l[level].b->c == b &&
-                   btree_node_locked_type(path, level) >= want) {
-                       six_lock_increment(&b->lock, (enum six_lock_type) want);
-                       return true;
-               }
-
-       return false;
-}
-
-static inline int btree_node_lock(struct btree_trans *trans,
-                       struct btree_path *path,
-                       struct btree_bkey_cached_common *b,
-                       unsigned level,
-                       enum six_lock_type type,
-                       unsigned long ip)
-{
-       int ret = 0;
-
-       EBUG_ON(level >= BTREE_MAX_DEPTH);
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       if (likely(six_trylock_type(&b->lock, type)) ||
-           btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
-           !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-               path->l[b->level].lock_taken_time = local_clock();
-#endif
-       }
-
-       return ret;
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
-                                struct btree_bkey_cached_common *b, bool);
-
-static inline int __btree_node_lock_write(struct btree_trans *trans,
-                                         struct btree_path *path,
-                                         struct btree_bkey_cached_common *b,
-                                         bool lock_may_not_fail)
-{
-       EBUG_ON(&path->l[b->level].b->c != b);
-       EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
-       EBUG_ON(!btree_node_intent_locked(path, b->level));
-
-       /*
-        * six locks are unfair, and read locks block while a thread wants a
-        * write lock: thus, we need to tell the cycle detector we have a write
-        * lock _before_ taking the lock:
-        */
-       mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
-
-       return likely(six_trylock_write(&b->lock))
-               ? 0
-               : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
-}
-
-static inline int __must_check
-bch2_btree_node_lock_write(struct btree_trans *trans,
-                          struct btree_path *path,
-                          struct btree_bkey_cached_common *b)
-{
-       return __btree_node_lock_write(trans, path, b, false);
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *,
-                                      struct btree_path *,
-                                      struct btree_bkey_cached_common *);
-
-/* relock: */
-
-bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
-int __bch2_btree_path_relock(struct btree_trans *,
-                            struct btree_path *, unsigned long);
-
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
-                               struct btree_path *path, unsigned long trace_ip)
-{
-       return btree_node_locked(path, path->level)
-               ? 0
-               : __bch2_btree_path_relock(trans, path, trace_ip);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-
-static inline bool bch2_btree_node_relock(struct btree_trans *trans,
-                                         struct btree_path *path, unsigned level)
-{
-       EBUG_ON(btree_node_locked(path, level) &&
-               !btree_node_write_locked(path, level) &&
-               btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
-       return likely(btree_node_locked(path, level)) ||
-               (!IS_ERR_OR_NULL(path->l[level].b) &&
-                __bch2_btree_node_relock(trans, path, level, true));
-}
-
-static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
-                                                 struct btree_path *path, unsigned level)
-{
-       EBUG_ON(btree_node_locked(path, level) &&
-               btree_node_locked_type_nowrite(path, level) !=
-               __btree_lock_want(path, level));
-
-       return likely(btree_node_locked(path, level)) ||
-               (!IS_ERR_OR_NULL(path->l[level].b) &&
-                __bch2_btree_node_relock(trans, path, level, false));
-}
-
-/* upgrade */
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
-                              struct btree_path *path,
-                              unsigned new_locks_want)
-{
-       return new_locks_want > path->locks_want
-               ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
-               : true;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *,
-                             struct btree_path *, unsigned);
-
-static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
-                                         struct btree_path *path,
-                                         unsigned new_locks_want)
-{
-       new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
-       return likely(path->locks_want >= new_locks_want && path->nodes_locked)
-               ? 0
-               : __bch2_btree_path_upgrade(trans, path, new_locks_want);
-}
-
-/* misc: */
-
-static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
-{
-       EBUG_ON(!btree_node_locked(path, path->level));
-       EBUG_ON(path->uptodate);
-
-       if (!path->should_be_locked) {
-               path->should_be_locked = true;
-               trace_btree_path_should_be_locked(trans, path);
-       }
-}
-
-static inline void __btree_path_set_level_up(struct btree_trans *trans,
-                                     struct btree_path *path,
-                                     unsigned l)
-{
-       btree_node_unlock(trans, path, l);
-       path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-}
-
-static inline void btree_path_set_level_up(struct btree_trans *trans,
-                                   struct btree_path *path)
-{
-       __btree_path_set_level_up(trans, path, path->level++);
-       btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-}
-
-/* debug */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
-                               struct btree_path *,
-                               struct btree_bkey_cached_common *b,
-                               unsigned);
-
-int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-
-void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
-void __bch2_trans_verify_locks(struct btree_trans *);
-
-static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
-                                               struct btree_path *path)
-{
-       if (static_branch_unlikely(&bch2_debug_check_btree_locking))
-               __bch2_btree_path_verify_locks(trans, path);
-}
-
-static inline void bch2_trans_verify_locks(struct btree_trans *trans)
-{
-       if (static_branch_unlikely(&bch2_debug_check_btree_locking))
-               __bch2_trans_verify_locks(trans);
-}
-
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
deleted file mode 100644 (file)
index a3fb07c..0000000
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "recovery_passes.h"
-
-#include <linux/kthread.h>
-#include <linux/min_heap.h>
-#include <linux/sched/sysctl.h>
-#include <linux/sort.h>
-
-struct find_btree_nodes_worker {
-       struct closure          *cl;
-       struct find_btree_nodes *f;
-       struct bch_dev          *ca;
-};
-
-static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
-{
-       bch2_btree_id_level_to_text(out, n->btree_id, n->level);
-       prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
-                  n->seq, n->journal_seq, n->cookie);
-       bch2_bpos_to_text(out, n->min_key);
-       prt_str(out, "-");
-       bch2_bpos_to_text(out, n->max_key);
-
-       if (n->range_updated)
-               prt_str(out, " range updated");
-
-       for (unsigned i = 0; i < n->nr_ptrs; i++) {
-               prt_char(out, ' ');
-               bch2_extent_ptr_to_text(out, c, n->ptrs + i);
-       }
-}
-
-static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
-{
-       printbuf_indent_add(out, 2);
-       darray_for_each(nodes, i) {
-               found_btree_node_to_text(out, c, i);
-               prt_newline(out);
-       }
-       printbuf_indent_sub(out, 2);
-}
-
-static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
-{
-       struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
-
-       set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
-       bp->k.p                 = f->max_key;
-       bp->v.seq               = cpu_to_le64(f->cookie);
-       bp->v.flags             = 0;
-       bp->v.sectors_written   = cpu_to_le16(f->sectors_written);
-       bp->v.min_key           = f->min_key;
-       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
-       memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
-}
-
-static inline u64 bkey_journal_seq(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_inode_v3:
-               return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
-       default:
-               return 0;
-       }
-}
-
-static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
-{
-       const struct found_btree_node *l = _l;
-       const struct found_btree_node *r = _r;
-
-       return  cmp_int(l->btree_id,    r->btree_id) ?:
-               cmp_int(l->level,       r->level) ?:
-               cmp_int(l->cookie,      r->cookie);
-}
-
-/*
- * Given two found btree nodes, if their sequence numbers are equal, take the
- * one that's readable:
- */
-static int found_btree_node_cmp_time(const struct found_btree_node *l,
-                                    const struct found_btree_node *r)
-{
-       return  cmp_int(l->seq, r->seq) ?:
-               cmp_int(l->journal_seq, r->journal_seq);
-}
-
-static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-{
-       const struct found_btree_node *l = _l;
-       const struct found_btree_node *r = _r;
-
-       return  cmp_int(l->btree_id,    r->btree_id) ?:
-              -cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->min_key,    r->min_key) ?:
-              -found_btree_node_cmp_time(l, r);
-}
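
Note the composition idiom used throughout: cmp_int() and bpos_cmp() return
negative/zero/positive, "?:" falls through to the next key only on a tie,
and a leading minus reverses that key's direction (here: descending level,
newest first). The same idiom in miniature (cmp_int() as in the kernel,
with extra parentheses):

	#include <assert.h>

	#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

	struct rec { int id, level; };

	static int rec_cmp(const struct rec *l, const struct rec *r)
	{
		return  cmp_int(l->id,    r->id) ?:	/* ascending id */
		       -cmp_int(l->level, r->level);	/* then descending level */
	}

	int main(void)
	{
		struct rec a = { .id = 1, .level = 2 };
		struct rec b = { .id = 1, .level = 0 };

		assert(rec_cmp(&a, &b) < 0);	/* equal ids: higher level first */
		return 0;
	}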
-
-static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
-{
-       return found_btree_node_cmp_pos(l, r) < 0;
-}
-
-static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
-{
-       struct found_btree_node *l = _l;
-       struct found_btree_node *r = _r;
-
-       swap(*l, *r);
-}
-
-static const struct min_heap_callbacks found_btree_node_heap_cbs = {
-       .less = found_btree_node_cmp_pos_less,
-       .swp = found_btree_node_swap,
-};
-
-static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
-                               struct btree *b, struct bio *bio, u64 offset)
-{
-       struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-       struct btree_node *bn = b->data;
-
-       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
-       bio->bi_iter.bi_sector  = offset;
-       bch2_bio_map(bio, b->data, c->opts.block_size);
-
-       u64 submit_time = local_clock();
-       submit_bio_wait(bio);
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
-       if (bio->bi_status) {
-               bch_err_dev_ratelimited(ca,
-                               "IO error in try_read_btree_node() at %llu: %s",
-                               offset, bch2_blk_status_to_str(bio->bi_status));
-               return;
-       }
-
-       if (le64_to_cpu(bn->magic) != bset_magic(c))
-               return;
-
-       if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
-               if (!c->chacha20_key_set)
-                       return;
-
-               struct nonce nonce = btree_nonce(&bn->keys, 0);
-               unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
-               bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
-       }
-
-       if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
-               return;
-
-       if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
-               return;
-
-       if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
-               return;
-
-       rcu_read_lock();
-       struct found_btree_node n = {
-               .btree_id       = BTREE_NODE_ID(bn),
-               .level          = BTREE_NODE_LEVEL(bn),
-               .seq            = BTREE_NODE_SEQ(bn),
-               .cookie         = le64_to_cpu(bn->keys.seq),
-               .min_key        = bn->min_key,
-               .max_key        = bn->max_key,
-               .nr_ptrs        = 1,
-               .ptrs[0].type   = 1 << BCH_EXTENT_ENTRY_ptr,
-               .ptrs[0].offset = offset,
-               .ptrs[0].dev    = ca->dev_idx,
-               .ptrs[0].gen    = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
-       };
-       rcu_read_unlock();
-
-       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
-       bio->bi_iter.bi_sector  = offset;
-       bch2_bio_map(bio, b->data, c->opts.btree_node_size);
-
-       submit_time = local_clock();
-       submit_bio_wait(bio);
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
-       found_btree_node_to_key(&b->key, &n);
-
-       CLASS(printbuf, buf)();
-       if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
-               /* read_done will swap out b->data for another buffer */
-               bn = b->data;
-               /*
-                * Grab journal_seq here because we want the max journal_seq of
-                * any bset; read_done sorts down to a single set and picks the
-                * max journal_seq
-                */
-               n.journal_seq           = le64_to_cpu(bn->keys.journal_seq);
-               n.sectors_written       = b->written;
-
-               mutex_lock(&f->lock);
-               if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
-                       bch_err(c, "try_read_btree_node() can't handle endian conversion");
-                       f->ret = -EINVAL;
-                       goto unlock;
-               }
-
-               if (darray_push(&f->nodes, n))
-                       f->ret = -ENOMEM;
-unlock:
-               mutex_unlock(&f->lock);
-       }
-}
-
-static int read_btree_nodes_worker(void *p)
-{
-       struct find_btree_nodes_worker *w = p;
-       struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
-       struct bch_dev *ca = w->ca;
-       unsigned long last_print = jiffies;
-       struct btree *b = NULL;
-       struct bio *bio = NULL;
-
-       b = __bch2_btree_node_mem_alloc(c);
-       if (!b) {
-               bch_err(c, "read_btree_nodes_worker: error allocating buf");
-               w->f->ret = -ENOMEM;
-               goto err;
-       }
-
-       bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
-       if (!bio) {
-               bch_err(c, "read_btree_nodes_worker: error allocating bio");
-               w->f->ret = -ENOMEM;
-               goto err;
-       }
-
-       for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
-               for (unsigned bucket_offset = 0;
-                    bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
-                    bucket_offset += btree_sectors(c)) {
-                       if (time_after(jiffies, last_print + HZ * 30)) {
-                               u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
-                               u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
-                               bch_info(ca, "%s: %2u%% done", __func__,
-                                        (unsigned) div64_u64(cur_sector * 100, end_sector));
-                               last_print = jiffies;
-                       }
-
-                       u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
-                       if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
-                           !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
-                               continue;
-
-                       try_read_btree_node(w->f, ca, b, bio, sector);
-               }
-err:
-       if (b)
-               __btree_node_data_free(b);
-       kfree(b);
-       bio_put(bio);
-       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-       closure_put(w->cl);
-       kfree(w);
-       return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
-       struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
-       struct closure cl;
-       int ret = 0;
-
-       closure_init_stack(&cl);
-
-       for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
-               if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
-                       continue;
-
-               struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
-               if (!w) {
-                       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-                       ret = -ENOMEM;
-                       goto err;
-               }
-
-               w->cl           = &cl;
-               w->f            = f;
-               w->ca           = ca;
-
-               struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
-               ret = PTR_ERR_OR_ZERO(t);
-               if (ret) {
-                       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-                       kfree(w);
-                       bch_err_msg(c, ret, "starting kthread");
-                       break;
-               }
-
-               closure_get(&cl);
-               enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
-               wake_up_process(t);
-       }
-err:
-       while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
-               ;
-       return f->ret ?: ret;
-}
-
-static bool nodes_overlap(const struct found_btree_node *l,
-                         const struct found_btree_node *r)
-{
-       return (l->btree_id     == r->btree_id &&
-               l->level        == r->level &&
-               bpos_gt(l->max_key, r->min_key));
-}
-
-static int handle_overwrites(struct bch_fs *c,
-                            struct found_btree_node *l,
-                            found_btree_nodes *nodes_heap)
-{
-       struct found_btree_node *r;
-
-       while ((r = min_heap_peek(nodes_heap)) &&
-              nodes_overlap(l, r)) {
-               int cmp = found_btree_node_cmp_time(l, r);
-
-               if (cmp > 0) {
-                       if (bpos_cmp(l->max_key, r->max_key) >= 0)
-                               min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-                       else {
-                               r->range_updated = true;
-                               r->min_key = bpos_successor(l->max_key);
-                               min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
-                       }
-               } else if (cmp < 0) {
-                       BUG_ON(bpos_eq(l->min_key, r->min_key));
-
-                       l->max_key = bpos_predecessor(r->min_key);
-                       l->range_updated = true;
-               } else if (r->level) {
-                       min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-               } else {
-                       if (bpos_cmp(l->max_key, r->max_key) >= 0)
-                               min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
-                       else {
-                               r->range_updated = true;
-                               r->min_key = bpos_successor(l->max_key);
-                               min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
-                       }
-               }
-
-               cond_resched();
-       }
-
-       return 0;
-}
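
The trimming rule above is easier to see with concrete numbers. A toy
model with u64 keys standing in for struct bpos, covering only the
partial-overlap cases (the pop-entirely and equal-time cases are omitted):

	#include <assert.h>

	struct node { unsigned long long min_key, max_key; unsigned seq; };

	/*
	 * Resolve a partial overlap: the node with the higher sequence
	 * number is newer and keeps its range; the older one is trimmed.
	 */
	static void resolve_overlap(struct node *l, struct node *r)
	{
		if (l->seq > r->seq)
			r->min_key = l->max_key + 1;	/* bpos_successor() */
		else
			l->max_key = r->min_key - 1;	/* bpos_predecessor() */
	}

	int main(void)
	{
		struct node l = { .min_key = 10, .max_key = 50, .seq = 5 };
		struct node r = { .min_key = 30, .max_key = 70, .seq = 3 };

		resolve_overlap(&l, &r);
		/* l was newer: r becomes [51, 70], overlap resolved */
		assert(l.max_key == 50 && r.min_key == 51);
		return 0;
	}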
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
-       struct find_btree_nodes *f = &c->found_btree_nodes;
-       struct printbuf buf = PRINTBUF;
-       found_btree_nodes nodes_heap = {};
-       size_t dst;
-       int ret = 0;
-
-       if (f->nodes.nr)
-               return 0;
-
-       mutex_init(&f->lock);
-
-       ret = read_btree_nodes(f);
-       if (ret)
-               return ret;
-
-       if (!f->nodes.nr) {
-               bch_err(c, "%s: no btree nodes found", __func__);
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (0 && c->opts.verbose) {
-               printbuf_reset(&buf);
-               prt_printf(&buf, "%s: nodes found:\n", __func__);
-               found_btree_nodes_to_text(&buf, c, f->nodes);
-               bch2_print_str(c, KERN_INFO, buf.buf);
-       }
-
-       sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
-       dst = 0;
-       darray_for_each(f->nodes, i) {
-               struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
-
-               if (prev &&
-                   prev->cookie == i->cookie) {
-                       if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
-                               bch_err(c, "%s: found too many replicas for btree node", __func__);
-                               ret = -EINVAL;
-                               goto err;
-                       }
-                       prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
-               } else {
-                       f->nodes.data[dst++] = *i;
-               }
-       }
-       f->nodes.nr = dst;
-
-       sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-
-       if (0 && c->opts.verbose) {
-               printbuf_reset(&buf);
-               prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
-               found_btree_nodes_to_text(&buf, c, f->nodes);
-               bch2_print_str(c, KERN_INFO, buf.buf);
-       }
-
-       swap(nodes_heap, f->nodes);
-
-       {
-               /* darray must have same layout as a heap */
-               min_heap_char real_heap;
-               BUILD_BUG_ON(sizeof(nodes_heap.nr)      != sizeof(real_heap.nr));
-               BUILD_BUG_ON(sizeof(nodes_heap.size)    != sizeof(real_heap.size));
-               BUILD_BUG_ON(offsetof(found_btree_nodes, nr)    != offsetof(min_heap_char, nr));
-               BUILD_BUG_ON(offsetof(found_btree_nodes, size)  != offsetof(min_heap_char, size));
-       }
-
-       min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-
-       if (nodes_heap.nr) {
-               ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
-               if (ret)
-                       goto err;
-
-               min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-       }
-
-       while (true) {
-               ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
-               if (ret)
-                       goto err;
-
-               if (!nodes_heap.nr)
-                       break;
-
-               ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
-               if (ret)
-                       goto err;
-
-               min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-       }
-
-       for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
-               BUG_ON(nodes_overlap(n, n + 1));
-
-       if (0 && c->opts.verbose) {
-               printbuf_reset(&buf);
-               prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
-               found_btree_nodes_to_text(&buf, c, f->nodes);
-               bch2_print_str(c, KERN_INFO, buf.buf);
-       } else {
-               bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
-       }
-
-       eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-err:
-       darray_exit(&nodes_heap);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
-{
-       const struct found_btree_node *l = _l;
-       const struct found_btree_node *r = _r;
-
-       return  cmp_int(l->btree_id,    r->btree_id) ?:
-              -cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->max_key,    r->min_key);
-}
-
-#define for_each_found_btree_node_in_range(_f, _search, _idx)                          \
-       for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,         \
-                                       sizeof((_f)->nodes.data[0]),                    \
-                                       found_btree_node_range_start_cmp, &(_search));  \
-            _idx < (_f)->nodes.nr &&                                                   \
-            (_f)->nodes.data[_idx].btree_id == _search.btree_id &&                     \
-            (_f)->nodes.data[_idx].level == _search.level &&                           \
-            bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);                  \
-            _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
-
-bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
-{
-       struct find_btree_nodes *f = &c->found_btree_nodes;
-
-       struct found_btree_node search = {
-               .btree_id       = b->c.btree_id,
-               .level          = b->c.level,
-               .min_key        = b->data->min_key,
-               .max_key        = b->key.k.p,
-       };
-
-       for_each_found_btree_node_in_range(f, search, idx)
-               if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
-                       return true;
-       return false;
-}
-
-int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
-{
-       int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
-       if (ret)
-               return ret;
-
-       struct found_btree_node search = {
-               .btree_id       = btree,
-               .level          = 0,
-               .min_key        = POS_MIN,
-               .max_key        = SPOS_MAX,
-       };
-
-       for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
-               return true;
-       return false;
-}
-
-int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
-                          unsigned level, struct bpos node_min, struct bpos node_max)
-{
-       if (btree_id_is_alloc(btree))
-               return 0;
-
-       struct find_btree_nodes *f = &c->found_btree_nodes;
-
-       int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
-       if (ret)
-               return ret;
-
-       if (c->opts.verbose) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "recovery ");
-               bch2_btree_id_level_to_text(&buf, btree, level);
-               prt_str(&buf, " ");
-               bch2_bpos_to_text(&buf, node_min);
-               prt_str(&buf, " - ");
-               bch2_bpos_to_text(&buf, node_max);
-
-               bch_info(c, "%s(): %s", __func__, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       struct found_btree_node search = {
-               .btree_id       = btree,
-               .level          = level,
-               .min_key        = node_min,
-               .max_key        = node_max,
-       };
-
-       for_each_found_btree_node_in_range(f, search, idx) {
-               struct found_btree_node n = f->nodes.data[idx];
-
-               n.range_updated |= bpos_lt(n.min_key, node_min);
-               n.min_key = bpos_max(n.min_key, node_min);
-
-               n.range_updated |= bpos_gt(n.max_key, node_max);
-               n.max_key = bpos_min(n.max_key, node_max);
-
-               struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
-               found_btree_node_to_key(&tmp.k, &n);
-
-               if (c->opts.verbose) {
-                       struct printbuf buf = PRINTBUF;
-                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
-                       bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-               BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
-                                         (struct bkey_validate_context) {
-                                               .from   = BKEY_VALIDATE_btree_node,
-                                               .level  = level + 1,
-                                               .btree  = btree,
-                                         }));
-
-               ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
-{
-       darray_exit(&f->nodes);
-}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
deleted file mode 100644 (file)
index 66e6f9e..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
-#define _BCACHEFS_BTREE_NODE_SCAN_H
-
-int bch2_scan_for_btree_nodes(struct bch_fs *);
-bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
-int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
deleted file mode 100644 (file)
index 2811b68..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-
-#include "darray.h"
-
-struct found_btree_node {
-       bool                    range_updated:1;
-       u8                      btree_id;
-       u8                      level;
-       unsigned                sectors_written;
-       u32                     seq;
-       u64                     journal_seq;
-       u64                     cookie;
-
-       struct bpos             min_key;
-       struct bpos             max_key;
-
-       unsigned                nr_ptrs;
-       struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX];
-};
-
-typedef DARRAY(struct found_btree_node)        found_btree_nodes;
-
-struct find_btree_nodes {
-       int                     ret;
-       struct mutex            lock;
-       found_btree_nodes       nodes;
-};
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
deleted file mode 100644 (file)
index 639ef75..0000000
+++ /dev/null
@@ -1,1121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "snapshot.h"
-
-#include <linux/prefetch.h>
-#include <linux/string_helpers.h>
-
-static const char * const trans_commit_flags_strs[] = {
-#define x(n, ...) #n,
-       BCH_TRANS_COMMIT_FLAGS()
-#undef x
-       NULL
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
-{
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
-       prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
-
-       flags >>= BCH_WATERMARK_BITS;
-       if (flags) {
-               prt_char(out, ' ');
-               bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
-       }
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct bch_fs *c = trans->c;
-       struct bkey u;
-       struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
-
-       if (unlikely(trans->journal_replay_not_finished)) {
-               struct bkey_i *j_k =
-                       bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
-               if (j_k)
-                       k = bkey_i_to_s_c(j_k);
-       }
-
-       u = *k.k;
-       u.needs_whiteout = i->old_k.needs_whiteout;
-
-       BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
-       BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       return (trans->paths + i->path)->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
-                                    struct btree_insert_entry *i)
-{
-       return i != trans->updates &&
-               insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
-                                    struct btree_insert_entry *i)
-{
-       return i + 1 < trans->updates + trans->nr_updates &&
-               insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
-                                          struct btree_path *path,
-                                          struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-
-       if (unlikely(btree_node_just_written(b)) &&
-           bch2_btree_post_write_cleanup(c, b))
-               bch2_trans_node_reinit_iter(trans, b);
-
-       /*
-        * If the last bset has been written, or if it's gotten too big - start
-        * a new bset to insert into:
-        */
-       if (want_new_bset(c, b))
-               bch2_btree_init_next(trans, b);
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       while (--i >= trans->updates) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
-       }
-
-       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
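-/*
- * Take the write lock on every leaf node being inserted into; on failure,
- * unwind the locks already taken and restart the transaction to avoid
- * deadlock:
- */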
-static inline int bch2_trans_lock_write(struct btree_trans *trans)
-{
-       EBUG_ON(trans->write_locked);
-
-       trans_for_each_update(trans, i) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
-                       return trans_lock_write_fail(trans, i);
-
-               if (!i->cached)
-                       bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
-       }
-
-       trans->write_locked = true;
-       return 0;
-}
-
-static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
-{
-       if (likely(trans->write_locked)) {
-               trans_for_each_update(trans, i)
-                       if (btree_node_locked_type(trans->paths + i->path, i->level) ==
-                           BTREE_NODE_WRITE_LOCKED)
-                               bch2_btree_node_unlock_write_inlined(trans,
-                                               trans->paths + i->path, insert_l(trans, i)->b);
-               trans->write_locked = false;
-       }
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
-                               struct btree_path *path,
-                               struct btree *b,
-                               struct btree_node_iter *node_iter,
-                               struct bkey_i *insert)
-{
-       struct bkey_packed *k;
-       unsigned clobber_u64s = 0, new_u64s = 0;
-
-       EBUG_ON(btree_node_just_written(b));
-       EBUG_ON(bset_written(b, btree_bset_last(b)));
-       EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-       EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
-       EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
-       EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
-       EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
-       kmsan_check_memory(insert, bkey_bytes(&insert->k));
-
-       k = bch2_btree_node_iter_peek_all(node_iter, b);
-       if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
-               k = NULL;
-
-       /* @k is the key being overwritten/deleted, if any: */
-       EBUG_ON(k && bkey_deleted(k));
-
-       /* Deleting, but not found? nothing to do: */
-       if (bkey_deleted(&insert->k) && !k)
-               return false;
-
-       if (bkey_deleted(&insert->k)) {
-               /* Deleting: */
-               btree_account_key_drop(b, k);
-               k->type = KEY_TYPE_deleted;
-
-               if (k->needs_whiteout)
-                       push_whiteout(b, insert->k.p);
-               k->needs_whiteout = false;
-
-               if (k >= btree_bset_last(b)->start) {
-                       clobber_u64s = k->u64s;
-                       bch2_bset_delete(b, k, clobber_u64s);
-                       goto fix_iter;
-               } else {
-                       bch2_btree_path_fix_key_modified(trans, b, k);
-               }
-
-               return true;
-       }
-
-       if (k) {
-               /* Overwriting: */
-               btree_account_key_drop(b, k);
-               k->type = KEY_TYPE_deleted;
-
-               insert->k.needs_whiteout = k->needs_whiteout;
-               k->needs_whiteout = false;
-
-               if (k >= btree_bset_last(b)->start) {
-                       clobber_u64s = k->u64s;
-                       goto overwrite;
-               } else {
-                       bch2_btree_path_fix_key_modified(trans, b, k);
-               }
-       }
-
-       k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
-       bch2_bset_insert(b, k, insert, clobber_u64s);
-       new_u64s = k->u64s;
-fix_iter:
-       if (clobber_u64s != new_u64s)
-               bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
-                                        clobber_u64s, new_u64s);
-       return true;
-}
-
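-/*
- * Journal pin flush callback: if the node is still dirty for this write index
- * and journal sequence number, mark it as needing a journal reclaim write and
- * issue that write if necessary.
- */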
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
-                              unsigned i, u64 seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct btree_write *w = container_of(pin, struct btree_write, journal);
-       struct btree *b = container_of(w, struct btree, writes[i]);
-       struct btree_trans *trans = bch2_trans_get(c);
-       unsigned long old, new;
-       unsigned idx = w - b->writes;
-
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
-       old = READ_ONCE(b->flags);
-       do {
-               new = old;
-
-               if (!(old & (1 << BTREE_NODE_dirty)) ||
-                   !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
-                   w->journal.seq != seq)
-                       break;
-
-               new &= ~BTREE_WRITE_TYPE_MASK;
-               new |= BTREE_WRITE_journal_reclaim;
-               new |= 1 << BTREE_NODE_need_write;
-       } while (!try_cmpxchg(&b->flags, &old, new));
-
-       btree_node_write_if_need(trans, b, SIX_LOCK_read);
-       six_unlock_read(&b->c.lock);
-
-       bch2_trans_put(trans);
-       return 0;
-}
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-       return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-       return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
-                                      struct btree *b, u64 seq)
-{
-       struct btree_write *w = btree_current_write(b);
-
-       bch2_journal_pin_add(&c->journal, seq, &w->journal,
-                            btree_node_write_idx(b) == 0
-                            ? bch2_btree_node_flush0
-                            : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert one key into a leaf node
- * @trans:             btree transaction object
- * @path:              path pointing to @insert's pos
- * @insert:            key to insert
- * @journal_seq:       sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      struct bkey_i *insert,
-                                      u64 journal_seq)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b = path_l(path)->b;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bset *i = bset(b, t);
-       int old_u64s = bset_u64s(t);
-       int old_live_u64s = b->nr.live_u64s;
-       int live_u64s_added, u64s_added;
-
-       if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
-                                       &path_l(path)->iter, insert)))
-               return;
-
-       i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
-       bch2_btree_add_journal_pin(c, b, journal_seq);
-
-       if (unlikely(!btree_node_dirty(b))) {
-               EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-               set_btree_node_dirty_acct(c, b);
-       }
-
-       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-       u64s_added = (int) bset_u64s(t) - old_u64s;
-
-       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-       if (u64s_added > live_u64s_added &&
-           bch2_maybe_compact_whiteouts(c, b))
-               bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
-                                            struct btree_insert_entry *i)
-{
-       struct btree_path *path = trans->paths + i->path;
-
-       BUG_ON(!bpos_eq(i->k->k.p, path->pos));
-       BUG_ON(i->cached        != path->cached);
-       BUG_ON(i->level         != path->level);
-       BUG_ON(i->btree_id      != path->btree_id);
-       BUG_ON(i->bkey_type     != __btree_node_type(path->level, path->btree_id));
-       EBUG_ON(!i->level &&
-               btree_type_has_snapshots(i->btree_id) &&
-               !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
-               test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
-               i->k->k.p.snapshot &&
-               bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
-                                                     unsigned flags)
-{
-       return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-                                   trans->journal_u64s, flags, trans);
-}
-
-#define JSET_ENTRY_LOG_U64s            4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct journal *j = &c->journal;
-       struct jset_entry *entry =
-               bch2_journal_add_entry(j, &trans->journal_res,
-                                      BCH_JSET_ENTRY_log, 0, 0,
-                                      JSET_ENTRY_LOG_U64s);
-       struct jset_entry_log *l =
-               container_of(entry, struct jset_entry_log, entry);
-
-       memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
-                      trans->fn, strlen(trans->fn), 0);
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
-                                      struct btree *b, unsigned u64s)
-{
-       if (!bch2_btree_node_insert_fits(b, u64s))
-               return bch_err_throw(trans->c, btree_insert_btree_node_full);
-
-       return 0;
-}
-
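-/*
- * Slowpath for growing a key cache entry: allocating with GFP_KERNEL may
- * block, so we have to drop our locks, allocate, and then relock before
- * installing the new buffer.
- */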
-noinline static int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
-                                    struct btree_path *path, unsigned new_u64s)
-{
-       struct bkey_cached *ck = (void *) path->l[0].b;
-       struct bkey_i *new_k;
-       int ret;
-
-       bch2_trans_unlock_updates_write(trans);
-       bch2_trans_unlock(trans);
-
-       new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
-       if (!new_k) {
-               struct bch_fs *c = trans->c;
-               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
-                       bch2_btree_id_str(path->btree_id), new_u64s);
-               return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
-       }
-
-       ret =   bch2_trans_relock(trans) ?:
-               bch2_trans_lock_write(trans);
-       if (unlikely(ret)) {
-               kfree(new_k);
-               return ret;
-       }
-
-       memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
-
-       trans_for_each_update(trans, i)
-               if (i->old_v == &ck->k->v)
-                       i->old_v = &new_k->v;
-
-       kfree(ck->k);
-       ck->u64s        = new_u64s;
-       ck->k           = new_k;
-       return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
-                                      struct btree_path *path, unsigned u64s)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) path->l[0].b;
-       unsigned new_u64s;
-       struct bkey_i *new_k;
-       unsigned watermark = flags & BCH_WATERMARK_MASK;
-
-       EBUG_ON(path->level);
-
-       if (watermark < BCH_WATERMARK_reclaim &&
-           !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-           bch2_btree_key_cache_must_wait(c))
-               return bch_err_throw(c, btree_insert_need_journal_reclaim);
-
-       /*
-        * bch2_varint_decode can read past the end of the buffer by at most 7
-        * bytes (it won't be used):
-        */
-       u64s += 1;
-
-       if (u64s <= ck->u64s)
-               return 0;
-
-       new_u64s        = roundup_pow_of_two(u64s);
-       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
-       if (unlikely(!new_k))
-               return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
-
-       trans_for_each_update(trans, i)
-               if (i->old_v == &ck->k->v)
-                       i->old_v = &new_k->v;
-
-       ck->u64s        = new_u64s;
-       ck->k           = new_k;
-       return 0;
-}
-
-/* Triggers: */
-
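-/*
- * Run the in-memory trigger for a single update: if the old and new key types
- * share a trigger it runs once with both the insert and overwrite flags set,
- * otherwise the insert and overwrite triggers run separately.
- */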
-static int run_one_mem_trigger(struct btree_trans *trans,
-                              struct btree_insert_entry *i,
-                              unsigned flags)
-{
-       verify_update_old_key(trans, i);
-
-       if (unlikely(flags & BTREE_TRIGGER_norun))
-               return 0;
-
-       struct bkey_s_c old = { &i->old_k, i->old_v };
-       struct bkey_i *new = i->k;
-       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
-       if (old_ops->trigger == new_ops->trigger)
-               return bch2_key_trigger(trans, i->btree_id, i->level,
-                               old, bkey_i_to_s(new),
-                               BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
-       else
-               return bch2_key_trigger_new(trans, i->btree_id, i->level,
-                               bkey_i_to_s(new), flags) ?:
-                      bch2_key_trigger_old(trans, i->btree_id, i->level,
-                               old, flags);
-}
-
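-/*
- * Run the transactional trigger for a single update, at most once each for
- * the overwrite and insert sides; returns 1 if a trigger ran, 0 if there was
- * nothing left to run, or a negative error code.
- */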
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       verify_update_old_key(trans, i);
-
-       if ((i->flags & BTREE_TRIGGER_norun) ||
-           !btree_node_type_has_trans_triggers(i->bkey_type))
-               return 0;
-
-       /*
-        * Transactional triggers create new btree_insert_entries, so we can't
-        * pass them a pointer to a btree_insert_entry; that memory is going to
-        * move:
-        */
-       struct bkey old_k = i->old_k;
-       struct bkey_s_c old = { &old_k, i->old_v };
-       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-       unsigned flags = i->flags|BTREE_TRIGGER_transactional;
-
-       if (!i->insert_trigger_run &&
-           !i->overwrite_trigger_run &&
-           old_ops->trigger == new_ops->trigger) {
-               i->overwrite_trigger_run = true;
-               i->insert_trigger_run = true;
-               return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
-                                       BTREE_TRIGGER_insert|
-                                       BTREE_TRIGGER_overwrite|flags) ?: 1;
-       } else if (!i->overwrite_trigger_run) {
-               i->overwrite_trigger_run = true;
-               return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
-       } else if (!i->insert_trigger_run) {
-               i->insert_trigger_run = true;
-               return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
-       } else {
-               return 0;
-       }
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
-       unsigned sort_id_start = 0;
-
-       while (sort_id_start < trans->nr_updates) {
-               unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
-               bool trans_trigger_run;
-
-               /*
-                * For a given btree, this algorithm runs insert triggers before
-                * overwrite triggers: this is so that when extents are being
-        * moved (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop
-                * references before they are re-added.
-                *
-                * Running triggers will append more updates to the list of
-                * updates as we're walking it:
-                */
-               do {
-                       trans_trigger_run = false;
-
-                       for (i = sort_id_start;
-                            i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
-                            i++) {
-                               if (trans->updates[i].sort_order < sort_id) {
-                                       sort_id_start = i;
-                                       continue;
-                               }
-
-                               int ret = run_one_trans_trigger(trans, trans->updates + i);
-                               if (ret < 0)
-                                       return ret;
-                               if (ret)
-                                       trans_trigger_run = true;
-                       }
-               } while (trans_trigger_run);
-
-               sort_id_start = i;
-       }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
-                      btree_node_type_has_trans_triggers(i->bkey_type) &&
-                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
-       return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
-       trans_for_each_update(trans, i)
-               if (btree_node_type_has_triggers(i->bkey_type) &&
-                   gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
-                       int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
-                       if (ret)
-                               return ret;
-               }
-
-       return 0;
-}
-
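-/*
- * Core of the commit path, called with write locks held on every node being
- * updated: check that the inserts fit, get the journal reservation, run
- * commit hooks and atomic triggers, then do the actual inserts.
- */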
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
-                              struct btree_insert_entry **stopped_at,
-                              unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_trans_commit_hook *h;
-       unsigned u64s = 0;
-       int ret = 0;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-#if 0
-       /* todo: bring back dynamic fault injection */
-       if (race_fault()) {
-               trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
-               return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
-       }
-#endif
-       /*
-        * Check if the insert will fit in the leaf node with the write lock
-        * held, otherwise another thread could write the node changing the
-        * amount of space available:
-        */
-
-       prefetch(&trans->c->journal.flags);
-
-       trans_for_each_update(trans, i) {
-               /* Multiple inserts might go to same leaf: */
-               if (!same_leaf_as_prev(trans, i))
-                       u64s = 0;
-
-               u64s += i->k->k.u64s;
-               ret = !i->cached
-                       ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
-                       : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
-               if (ret) {
-                       *stopped_at = i;
-                       return ret;
-               }
-
-               i->k->k.needs_whiteout = false;
-       }
-
-       /*
-        * Don't get journal reservation until after we know insert will
-        * succeed:
-        */
-       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
-               ret = bch2_trans_journal_res_get(trans,
-                               (flags & BCH_WATERMARK_MASK)|
-                               JOURNAL_RES_GET_NONBLOCK);
-               if (ret)
-                       return ret;
-
-               if (unlikely(trans->journal_transaction_names))
-                       journal_transaction_name(trans);
-       }
-
-       /*
-        * Not allowed to fail after we've gotten our journal reservation - we
-        * have to use it:
-        */
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
-               if (static_branch_unlikely(&bch2_journal_seq_verify))
-                       trans_for_each_update(trans, i)
-                               i->k->k.bversion.lo = trans->journal_res.seq;
-               else if (static_branch_unlikely(&bch2_inject_invalid_keys))
-                       trans_for_each_update(trans, i)
-                               i->k->k.bversion = MAX_VERSION;
-       }
-
-       h = trans->hooks;
-       while (h) {
-               ret = h->fn(trans, h);
-               if (ret)
-                       return ret;
-               h = h->next;
-       }
-
-       struct bkey_i *accounting;
-
-       percpu_down_read(&c->mark_lock);
-       for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
-            accounting != btree_trans_subbuf_top(trans, &trans->accounting);
-            accounting = bkey_next(accounting)) {
-               ret = bch2_accounting_trans_commit_hook(trans,
-                                       bkey_i_to_accounting(accounting), flags);
-               if (ret)
-                       goto revert_fs_usage;
-       }
-       percpu_up_read(&c->mark_lock);
-
-       /* XXX: we only want to run this if deltas are nonzero */
-       bch2_trans_account_disk_usage_change(trans);
-
-       trans_for_each_update(trans, i)
-               if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
-                       ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
-                       if (ret)
-                               goto fatal_err;
-               }
-
-       if (unlikely(c->gc_pos.phase)) {
-               ret = bch2_trans_commit_run_gc_triggers(trans);
-               if (ret)
-                       goto fatal_err;
-       }
-
-       struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
-
-       if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
-               validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
-       for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
-            i != btree_trans_journal_entries_top(trans);
-            i = vstruct_next(i)) {
-               ret = bch2_journal_entry_validate(c, NULL, i,
-                                                 bcachefs_metadata_version_current,
-                                                 CPU_BIG_ENDIAN, validate_context);
-               if (unlikely(ret)) {
-                       bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
-                                               trans->fn);
-                       goto fatal_err;
-               }
-       }
-
-       trans_for_each_update(trans, i) {
-               validate_context.level  = i->level;
-               validate_context.btree  = i->btree_id;
-
-               ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
-               if (unlikely(ret)) {
-                       bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
-                                               trans->fn, (void *) i->ip_allocated);
-                       goto fatal_err;
-               }
-               btree_insert_entry_checks(trans, i);
-       }
-
-       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
-               struct journal *j = &c->journal;
-               struct jset_entry *entry;
-
-               trans_for_each_update(trans, i) {
-                       if (i->key_cache_already_flushed)
-                               continue;
-
-                       if (i->flags & BTREE_UPDATE_nojournal)
-                               continue;
-
-                       verify_update_old_key(trans, i);
-
-                       if (trans->journal_transaction_names) {
-                               entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                                      BCH_JSET_ENTRY_overwrite,
-                                                      i->btree_id, i->level,
-                                                      i->old_k.u64s);
-                               bkey_reassemble((struct bkey_i *) entry->start,
-                                               (struct bkey_s_c) { &i->old_k, i->old_v });
-                       }
-
-                       entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                              BCH_JSET_ENTRY_btree_keys,
-                                              i->btree_id, i->level,
-                                              i->k->k.u64s);
-                       bkey_copy((struct bkey_i *) entry->start, i->k);
-               }
-
-               memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
-                                 btree_trans_journal_entries_start(trans),
-                                 trans->journal_entries.u64s);
-
-               EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
-
-               trans->journal_res.offset       += trans->journal_entries.u64s;
-               trans->journal_res.u64s         -= trans->journal_entries.u64s;
-
-               memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
-                                               BCH_JSET_ENTRY_write_buffer_keys,
-                                               BTREE_ID_accounting, 0,
-                                               trans->accounting.u64s)->_data,
-                                 btree_trans_subbuf_base(trans, &trans->accounting),
-                                 trans->accounting.u64s);
-
-               if (trans->journal_seq)
-                       *trans->journal_seq = trans->journal_res.seq;
-       }
-
-       trans_for_each_update(trans, i) {
-               struct btree_path *path = trans->paths + i->path;
-
-               if (!i->cached)
-                       bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
-               else if (!i->key_cache_already_flushed)
-                       bch2_btree_insert_key_cached(trans, flags, i);
-               else
-                       bch2_btree_key_cache_drop(trans, path);
-       }
-
-       return 0;
-fatal_err:
-       bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
-       percpu_down_read(&c->mark_lock);
-revert_fs_usage:
-       for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
-            i != accounting;
-            i = bkey_next(i))
-               bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
-       percpu_up_read(&c->mark_lock);
-       return ret;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
-       /*
-        * Accounting keys aren't deduped in the journal: we have to compare
-        * each individual update against what's in the btree to see if it has
-        * been applied yet, and accounting updates also don't overwrite:
-        * they're deltas that accumulate.
-        */
-       trans_for_each_update(trans, i)
-               if (i->k->k.type != KEY_TYPE_accounting)
-                       bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static int bch2_trans_commit_journal_pin_flush(struct journal *j,
-                               struct journal_entry_pin *_pin, u64 seq)
-{
-       return 0;
-}
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
-                                      struct btree_insert_entry **stopped_at,
-                                      unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0, u64s_delta = 0;
-
-       for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
-               struct btree_insert_entry *i = trans->updates + idx;
-               if (i->cached)
-                       continue;
-
-               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-               u64s_delta -= i->old_btree_u64s;
-
-               if (!same_leaf_as_next(trans, i)) {
-                       if (u64s_delta <= 0) {
-                               ret = bch2_foreground_maybe_merge(trans, i->path,
-                                                       i->level, flags);
-                               if (unlikely(ret))
-                                       return ret;
-                       }
-
-                       u64s_delta = 0;
-               }
-       }
-
-       ret = bch2_trans_lock_write(trans);
-       if (unlikely(ret))
-               return ret;
-
-       ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
-       if (!ret && unlikely(trans->journal_replay_not_finished))
-               bch2_drop_overwrites_from_journal(trans);
-
-       bch2_trans_unlock_updates_write(trans);
-
-       if (!ret && trans->journal_pin)
-               bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-                                    trans->journal_pin,
-                                    bch2_trans_commit_journal_pin_flush);
-
-       /*
-        * Drop journal reservation after dropping write locks, since dropping
-        * the journal reservation may kick off a journal write:
-        */
-       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
-               bch2_journal_res_put(&c->journal, &trans->journal_res);
-
-       return ret;
-}
-
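-/*
- * Wait condition for blocked key cache flushing: done on journal error or
- * once the key cache has drained far enough; otherwise kick journal reclaim
- * and keep waiting.
- */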
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
-       int ret = bch2_journal_error(&c->journal) ?:
-               bch2_btree_key_cache_wait_done(c);
-
-       if (!ret)
-               journal_reclaim_kick(&c->journal);
-       return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
-                           struct btree_insert_entry *i,
-                           int ret, unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
-       if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
-               /*
-                * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-                * flag
-                */
-               if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-                   watermark < BCH_WATERMARK_reclaim) {
-                       ret = bch_err_throw(c, journal_reclaim_would_deadlock);
-                       goto out;
-               }
-
-               ret = drop_locks_do(trans,
-                       bch2_trans_journal_res_get(trans,
-                                       (flags & BCH_WATERMARK_MASK)|
-                                       JOURNAL_RES_GET_CHECK));
-               goto out;
-       }
-
-       switch (ret) {
-       case -BCH_ERR_btree_insert_btree_node_full:
-               ret = bch2_btree_split_leaf(trans, i->path, flags);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       trace_and_count(c, trans_restart_btree_node_split, trans,
-                                       trace_ip, trans->paths + i->path);
-               break;
-       case -BCH_ERR_btree_insert_need_mark_replicas:
-               ret = drop_locks_do(trans,
-                       bch2_accounting_update_sb(trans));
-               break;
-       case -BCH_ERR_btree_insert_need_journal_reclaim:
-               bch2_trans_unlock(trans);
-
-               trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
-               track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
-
-               wait_event_freezable(c->journal.reclaim_wait,
-                                    (ret = journal_reclaim_wait_done(c)));
-
-               track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
-
-               if (ret < 0)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               break;
-       default:
-               BUG_ON(ret >= 0);
-               break;
-       }
-out:
-       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-                               (flags & BCH_TRANS_COMMIT_no_enospc), c,
-               "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
-       return ret;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. We only add the new key to the list of keys for journal replay to
- * process.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-
-       BUG_ON(current != c->recovery_task);
-
-       trans_for_each_update(trans, i) {
-               int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
-               if (ret)
-                       return ret;
-       }
-
-       for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
-            i != btree_trans_journal_entries_top(trans);
-            i = vstruct_next(i)) {
-               if (i->type == BCH_JSET_ENTRY_btree_keys ||
-                   i->type == BCH_JSET_ENTRY_write_buffer_keys) {
-                       jset_entry_for_each_key(i, k) {
-                               int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
-                               if (ret)
-                                       return ret;
-                       }
-               }
-
-               if (i->type == BCH_JSET_ENTRY_btree_root) {
-                       guard(mutex)(&c->btree_root_lock);
-
-                       struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
-
-                       bkey_copy(&r->key, i->start);
-                       r->level = i->level;
-                       r->alive = true;
-               }
-       }
-
-       for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
-            i != btree_trans_subbuf_top(trans, &trans->accounting);
-            i = bkey_next(i)) {
-               int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
-       struct btree_insert_entry *errored_at = NULL;
-       struct bch_fs *c = trans->c;
-       unsigned journal_u64s = 0;
-       int ret = 0;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       ret = trans_maybe_inject_restart(trans, _RET_IP_);
-       if (unlikely(ret))
-               goto out_reset;
-
-       if (!trans->nr_updates &&
-           !trans->journal_entries.u64s &&
-           !trans->accounting.u64s)
-               goto out_reset;
-
-       ret = bch2_trans_commit_run_triggers(trans);
-       if (ret)
-               goto out_reset;
-
-       if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
-           unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
-               if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
-                       ret = do_bch2_trans_commit_to_journal_replay(trans);
-               else
-                       ret = bch_err_throw(c, erofs_trans_commit);
-               goto out_reset;
-       }
-
-       EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-
-       journal_u64s = jset_u64s(trans->accounting.u64s);
-       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-       if (trans->journal_transaction_names)
-               journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
-       trans_for_each_update(trans, i) {
-               struct btree_path *path = trans->paths + i->path;
-
-               EBUG_ON(!path->should_be_locked);
-
-               ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
-               if (unlikely(ret))
-                       goto out;
-
-               EBUG_ON(!btree_node_intent_locked(path, i->level));
-
-               if (i->key_cache_already_flushed)
-                       continue;
-
-               if (i->flags & BTREE_UPDATE_nojournal)
-                       continue;
-
-               /* we're going to journal the key being updated: */
-               journal_u64s += jset_u64s(i->k->k.u64s);
-
-               /* and we're also going to log the overwrite: */
-               if (trans->journal_transaction_names)
-                       journal_u64s += jset_u64s(i->old_k.u64s);
-       }
-
-       if (trans->extra_disk_res) {
-               ret = bch2_disk_reservation_add(c, trans->disk_res,
-                               trans->extra_disk_res,
-                               (flags & BCH_TRANS_COMMIT_no_enospc)
-                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
-               if (ret)
-                       goto err;
-       }
-retry:
-       errored_at = NULL;
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
-               memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-       memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
-       trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
-
-       ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
-
-       /* make sure we didn't drop or screw up locks: */
-       bch2_trans_verify_locks(trans);
-
-       if (ret)
-               goto err;
-
-       trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
-       if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
-out_reset:
-       if (!ret)
-               bch2_trans_downgrade(trans);
-       bch2_trans_reset_updates(trans);
-
-       return ret;
-err:
-       ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
-       if (ret)
-               goto out;
-
-       /*
-        * We might have done another transaction commit in the error path -
-        * i.e. btree write buffer flush - which will have made use of
-        * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
-        * how the journal sequence number to pin is passed in - so we must
-        * restart:
-        */
-       if (flags & BCH_TRANS_COMMIT_no_journal_res) {
-               ret = bch_err_throw(c, transaction_restart_nested);
-               goto out;
-       }
-
-       goto retry;
-}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
deleted file mode 100644 (file)
index 112170f..0000000
+++ /dev/null
@@ -1,937 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_TYPES_H
-#define _BCACHEFS_BTREE_TYPES_H
-
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-
-#include "bbpos_types.h"
-#include "btree_key_cache_types.h"
-#include "buckets_types.h"
-#include "darray.h"
-#include "errcode.h"
-#include "journal_types.h"
-#include "replicas_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_update;
-struct btree_trans;
-
-#define MAX_BSETS              3U
-
-struct btree_nr_keys {
-
-       /*
-        * Amount of live metadata (i.e. size of node after a compaction) in
-        * units of u64s
-        */
-       u16                     live_u64s;
-       u16                     bset_u64s[MAX_BSETS];
-
-       /* live keys only: */
-       u16                     packed_keys;
-       u16                     unpacked_keys;
-};
-
-struct bset_tree {
-       /*
-        * We construct a binary tree in an array as if the array
-        * started at 1, so that things line up on the same cachelines
-        * better: see comments in bset.c at cacheline_to_bkey() for
-        * details
-        */
-
-       /* size of the binary tree and prev array */
-       u16                     size;
-
-       /* function of size - precalculated for to_inorder() */
-       u16                     extra;
-
-       u16                     data_offset;
-       u16                     aux_data_offset;
-       u16                     end_offset;
-};
-
-struct btree_write {
-       struct journal_entry_pin        journal;
-};
-
-struct btree_alloc {
-       struct open_buckets     ob;
-       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-struct btree_bkey_cached_common {
-       struct six_lock         lock;
-       u8                      level;
-       u8                      btree_id;
-       bool                    cached;
-};
-
-struct btree {
-       struct btree_bkey_cached_common c;
-
-       struct rhash_head       hash;
-       u64                     hash_val;
-
-       unsigned long           flags;
-       u16                     written;
-       u8                      nsets;
-       u8                      nr_key_bits;
-       u16                     version_ondisk;
-
-       struct bkey_format      format;
-
-       struct btree_node       *data;
-       void                    *aux_data;
-
-       /*
-        * Sets of sorted keys - the real btree node - plus a binary search tree
-        *
-        * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
-        * to the memory we have allocated for this btree node. Additionally,
-        * set[0]->data points to the entire btree node as it exists on disk.
-        */
-       struct bset_tree        set[MAX_BSETS];
-
-       struct btree_nr_keys    nr;
-       u16                     sib_u64s[2];
-       u16                     whiteout_u64s;
-       u8                      byte_order;
-       u8                      unpack_fn_len;
-
-       struct btree_write      writes[2];
-
-       /* Key/pointer for this btree node */
-       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-       /*
-        * XXX: add a delete sequence number, so when bch2_btree_node_relock()
-        * fails because the lock sequence number has changed - i.e. the
-        * contents were modified - we can still relock the node if it's still
-        * the one we want, without redoing the traversal
-        */
-
-       /*
-        * For asynchronous splits/interior node updates:
-        * When we do a split, we allocate new child nodes and update the parent
-        * node to point to them: we update the parent in memory immediately,
-        * but then we must wait until the children have been written out before
-        * the update to the parent can be written - this is a list of the
-        * btree_updates that are blocking this node from being
-        * written:
-        */
-       struct list_head        write_blocked;
-
-       /*
-        * Also for asynchronous splits/interior node updates:
-        * If a btree node isn't reachable yet, we don't want to kick off
-        * another write - because that write also won't yet be reachable and
-        * marking it as completed before it's reachable would be incorrect:
-        */
-       unsigned long           will_make_reachable;
-
-       struct open_buckets     ob;
-
-       /* lru list */
-       struct list_head        list;
-};
-
-#define BCH_BTREE_CACHE_NOT_FREED_REASONS()    \
-       x(cache_reserve)                        \
-       x(lock_intent)                          \
-       x(lock_write)                           \
-       x(dirty)                                \
-       x(read_in_flight)                       \
-       x(write_in_flight)                      \
-       x(noevict)                              \
-       x(write_blocked)                        \
-       x(will_make_reachable)                  \
-       x(access_bit)
-
-enum bch_btree_cache_not_freed_reasons {
-#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
-       BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
-       BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
-};
-
-struct btree_cache_list {
-       unsigned                idx;
-       struct shrinker         *shrink;
-       struct list_head        list;
-       size_t                  nr;
-};
-
-struct btree_cache {
-       struct rhashtable       table;
-       bool                    table_init_done;
-       /*
-        * We never free a struct btree, except on shutdown - we just put it on
-        * the btree_cache_freed list and reuse it later. This simplifies the
-        * code, and it doesn't cost us much memory as the memory usage is
-        * dominated by buffers that hold the actual btree node data and those
-        * can be freed - and the number of struct btrees allocated is
-        * effectively bounded.
-        *
-        * btree_cache_freeable effectively is a small cache - we use it because
-        * high order page allocations can be rather expensive, and it's quite
-        * common to delete and allocate btree nodes in quick succession. It
-        * should never grow past ~2-3 nodes in practice.
-        */
-       struct mutex            lock;
-       struct list_head        freeable;
-       struct list_head        freed_pcpu;
-       struct list_head        freed_nonpcpu;
-       struct btree_cache_list live[2];
-
-       size_t                  nr_freeable;
-       size_t                  nr_reserve;
-       size_t                  nr_by_btree[BTREE_ID_NR];
-       atomic_long_t           nr_dirty;
-
-       /* shrinker stats */
-       size_t                  nr_freed;
-       u64                     not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-
-       /*
-        * If we need to allocate memory for a new btree node and that
-        * allocation fails, we can cannibalize another node in the btree cache
-        * to satisfy the allocation - lock to guarantee only one thread does
-        * this at a time:
-        */
-       struct task_struct      *alloc_lock;
-       struct closure_waitlist alloc_wait;
-
-       struct bbpos            pinned_nodes_start;
-       struct bbpos            pinned_nodes_end;
-       /* btree id mask: 0 for leaves, 1 for interior */
-       u64                     pinned_nodes_mask[2];
-};
-
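-/*
- * Per-bset iterator positions, for walking all the keys in a btree node in
- * sorted order:
- */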
-struct btree_node_iter {
-       struct btree_node_iter_set {
-               u16     k, end;
-       } data[MAX_BSETS];
-};
-
-#define BTREE_ITER_FLAGS()                     \
-       x(slots)                                \
-       x(intent)                               \
-       x(prefetch)                             \
-       x(is_extents)                           \
-       x(not_extents)                          \
-       x(cached)                               \
-       x(with_key_cache)                       \
-       x(with_updates)                         \
-       x(with_journal)                         \
-       x(snapshot_field)                       \
-       x(all_snapshots)                        \
-       x(filter_snapshots)                     \
-       x(nopreserve)                           \
-       x(cached_nofill)                        \
-       x(key_cache_fill)                       \
-
-#define STR_HASH_FLAGS()                       \
-       x(must_create)                          \
-       x(must_replace)
-
-#define BTREE_UPDATE_FLAGS()                   \
-       x(internal_snapshot_node)               \
-       x(nojournal)                            \
-       x(key_cache_reclaim)
-
-/*
- * BTREE_TRIGGER_norun - don't run triggers at all
- *
- * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
- * a transaction commit: triggers may generate new updates
- *
- * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
- * commit: we have our journal reservation, we're holding btree node write
- * locks, and we know the transaction is going to commit (returning an error
- * here is a fatal error, causing us to go emergency read-only)
- *
- * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
- *
- * BTREE_TRIGGER_insert - @new is entering the btree
- * BTREE_TRIGGER_overwrite - @old is leaving the btree
- */
-#define BTREE_TRIGGER_FLAGS()                  \
-       x(norun)                                \
-       x(transactional)                        \
-       x(atomic)                               \
-       x(check_repair)                         \
-       x(gc)                                   \
-       x(insert)                               \
-       x(overwrite)                            \
-       x(is_root)
-
-enum {
-#define x(n) BTREE_ITER_FLAG_BIT_##n,
-       BTREE_ITER_FLAGS()
-       STR_HASH_FLAGS()
-       BTREE_UPDATE_FLAGS()
-       BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-/* iter flags must fit in a u16: */
-//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
-
-enum btree_iter_update_trigger_flags {
-#define x(n) BTREE_ITER_##n    = 1U << BTREE_ITER_FLAG_BIT_##n,
-       BTREE_ITER_FLAGS()
-#undef x
-#define x(n) STR_HASH_##n      = 1U << BTREE_ITER_FLAG_BIT_##n,
-       STR_HASH_FLAGS()
-#undef x
-#define x(n) BTREE_UPDATE_##n  = 1U << BTREE_ITER_FLAG_BIT_##n,
-       BTREE_UPDATE_FLAGS()
-#undef x
-#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
-       BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-enum btree_path_uptodate {
-       BTREE_ITER_UPTODATE             = 0,
-       BTREE_ITER_NEED_RELOCK          = 1,
-       BTREE_ITER_NEED_TRAVERSE        = 2,
-};
-
-#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
-#define TRACK_PATH_ALLOCATED
-#endif
-
-typedef u16 btree_path_idx_t;
-
-struct btree_path {
-       btree_path_idx_t        sorted_idx;
-       u8                      ref;
-       u8                      intent_ref;
-
-       /* btree_iter_copy starts here: */
-       struct bpos             pos;
-
-       enum btree_id           btree_id:5;
-       bool                    cached:1;
-       bool                    preserve:1;
-       enum btree_path_uptodate uptodate:2;
-       /*
-        * When true, failing to relock this path will cause the transaction to
-        * restart:
-        */
-       bool                    should_be_locked:1;
-       unsigned                level:3,
-                               locks_want:3;
-       u8                      nodes_locked;
-
-       struct btree_path_level {
-               struct btree    *b;
-               struct btree_node_iter iter;
-               u32             lock_seq;
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-               u64             lock_taken_time;
-#endif
-       }                       l[BTREE_MAX_DEPTH];
-#ifdef TRACK_PATH_ALLOCATED
-       unsigned long           ip_allocated;
-#endif
-};
-
-static inline struct btree_path_level *path_l(struct btree_path *path)
-{
-       return path->l + path->level;
-}
-
-static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
-{
-#ifdef TRACK_PATH_ALLOCATED
-       return path->ip_allocated;
-#else
-       return _THIS_IP_;
-#endif
-}
-
-/*
- * @pos                        - iterator's current position
- * @level              - current btree depth
- * @locks_want         - btree level below which we start taking intent locks
- * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked        - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
-       btree_path_idx_t        path;
-       btree_path_idx_t        update_path;
-       btree_path_idx_t        key_cache_path;
-
-       enum btree_id           btree_id:8;
-       u8                      min_depth;
-
-       /* btree_iter_copy starts here: */
-       u16                     flags;
-
-       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
-       unsigned                snapshot;
-
-       struct bpos             pos;
-       /*
-        * Current unpacked key - so that bch2_btree_iter_next()/
-        * bch2_btree_iter_next_slot() can correctly advance pos.
-        */
-       struct bkey             k;
-
-       /* BTREE_ITER_with_journal: */
-       size_t                  journal_idx;
-#ifdef TRACK_PATH_ALLOCATED
-       unsigned long           ip_allocated;
-#endif
-};
-
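-/* bkey_cached->flags bits: */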
-#define BKEY_CACHED_ACCESSED           0
-#define BKEY_CACHED_DIRTY              1
-
-struct bkey_cached {
-       struct btree_bkey_cached_common c;
-
-       unsigned long           flags;
-       u16                     u64s;
-       struct bkey_cached_key  key;
-
-       struct rhash_head       hash;
-
-       struct journal_entry_pin journal;
-       u64                     seq;
-
-       struct bkey_i           *k;
-       struct rcu_head         rcu;
-};
-
-static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
-{
-       return !b->cached
-               ? container_of(b, struct btree, c)->key.k.p
-               : container_of(b, struct bkey_cached, c)->key.pos;
-}
-
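-/* A single pending update, tracked within a btree_trans: */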
-struct btree_insert_entry {
-       unsigned                flags;
-       u8                      sort_order;
-       u8                      bkey_type;
-       enum btree_id           btree_id:8;
-       u8                      level:4;
-       bool                    cached:1;
-       bool                    insert_trigger_run:1;
-       bool                    overwrite_trigger_run:1;
-       bool                    key_cache_already_flushed:1;
-       /*
-        * @old_k may be a key from the journal; @old_btree_u64s always refers
-        * to the size of the key being overwritten in the btree:
-        */
-       u8                      old_btree_u64s;
-       btree_path_idx_t        path;
-       struct bkey_i           *k;
-       /* key being overwritten: */
-       struct bkey             old_k;
-       const struct bch_val    *old_v;
-       unsigned long           ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL             64
-/*
- * Limit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have been eliminated now, so we
- * might consider eliminating this (and btree_trans_too_many_iters()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT                256
-/* never exceed limit */
-#define BTREE_ITER_MAX                 (1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
-       btree_trans_commit_hook_fn      *fn;
-       struct btree_trans_commit_hook  *next;
-};
-
-#define BTREE_TRANS_MEM_MAX    (1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS      10000
-
-struct btree_trans_paths {
-       unsigned long           nr_paths;
-       struct btree_path       paths[];
-};
-
-struct trans_kmalloc_trace {
-       unsigned long           ip;
-       size_t                  bytes;
-};
-typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
-
-struct btree_trans_subbuf {
-       u16                     base;
-       u16                     u64s;
-       u16                     size;
-};
-
-struct btree_trans {
-       struct bch_fs           *c;
-
-       unsigned long           *paths_allocated;
-       struct btree_path       *paths;
-       btree_path_idx_t        *sorted;
-       struct btree_insert_entry *updates;
-
-       void                    *mem;
-       unsigned                mem_top;
-       unsigned                mem_bytes;
-       unsigned                realloc_bytes_required;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-       darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
-
-       btree_path_idx_t        nr_sorted;
-       btree_path_idx_t        nr_paths;
-       btree_path_idx_t        nr_paths_max;
-       btree_path_idx_t        nr_updates;
-       u8                      fn_idx;
-       u8                      lock_must_abort;
-       bool                    lock_may_not_fail:1;
-       bool                    srcu_held:1;
-       bool                    locked:1;
-       bool                    pf_memalloc_nofs:1;
-       bool                    write_locked:1;
-       bool                    used_mempool:1;
-       bool                    in_traverse_all:1;
-       bool                    paths_sorted:1;
-       bool                    memory_allocation_failure:1;
-       bool                    journal_transaction_names:1;
-       bool                    journal_replay_not_finished:1;
-       bool                    notrace_relock_fail:1;
-       enum bch_errcode        restarted:16;
-       u32                     restart_count;
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-       u32                     restart_count_this_trans;
-#endif
-
-       u64                     last_begin_time;
-       unsigned long           last_begin_ip;
-       unsigned long           last_restarted_ip;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       bch_stacktrace          last_restarted_trace;
-#endif
-       unsigned long           last_unlock_ip;
-       unsigned long           srcu_lock_time;
-
-       const char              *fn;
-       struct btree_bkey_cached_common *locking;
-       struct six_lock_waiter  locking_wait;
-       int                     srcu_idx;
-
-       /* update path: */
-       struct btree_trans_subbuf journal_entries;
-       struct btree_trans_subbuf accounting;
-
-       struct btree_trans_commit_hook *hooks;
-       struct journal_entry_pin *journal_pin;
-
-       struct journal_res      journal_res;
-       u64                     *journal_seq;
-       struct disk_reservation *disk_res;
-
-       struct bch_fs_usage_base fs_usage_delta;
-
-       unsigned                journal_u64s;
-       unsigned                extra_disk_res; /* XXX kill */
-
-       __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      dep_map;
-#endif
-       /* Entries before this are zeroed out on every bch2_trans_get() call */
-
-       struct list_head        list;
-       struct closure          ref;
-
-       unsigned long           _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
-       struct btree_trans_paths trans_paths;
-       struct btree_path       _paths[BTREE_ITER_INITIAL];
-       btree_path_idx_t        _sorted[BTREE_ITER_INITIAL + 4];
-       struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
-};
-
-static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return trans->paths + iter->path;
-}
-
-static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return iter->key_cache_path
-               ? trans->paths + iter->key_cache_path
-               : NULL;
-}
-
-#define BCH_BTREE_WRITE_TYPES()                                                \
-       x(initial,              0)                                      \
-       x(init_next_bset,       1)                                      \
-       x(cache_reclaim,        2)                                      \
-       x(journal_reclaim,      3)                                      \
-       x(interior,             4)
-
-enum btree_write_type {
-#define x(t, n) BTREE_WRITE_##t,
-       BCH_BTREE_WRITE_TYPES()
-#undef x
-       BTREE_WRITE_TYPE_NR,
-};
-
-#define BTREE_WRITE_TYPE_MASK  (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
-#define BTREE_WRITE_TYPE_BITS  ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
-
-#define BTREE_FLAGS()                                                  \
-       x(read_in_flight)                                               \
-       x(read_error)                                                   \
-       x(dirty)                                                        \
-       x(need_write)                                                   \
-       x(write_blocked)                                                \
-       x(will_make_reachable)                                          \
-       x(noevict)                                                      \
-       x(write_idx)                                                    \
-       x(accessed)                                                     \
-       x(write_in_flight)                                              \
-       x(write_in_flight_inner)                                        \
-       x(just_written)                                                 \
-       x(dying)                                                        \
-       x(fake)                                                         \
-       x(need_rewrite)                                                 \
-       x(need_rewrite_error)                                           \
-       x(need_rewrite_degraded)                                        \
-       x(need_rewrite_ptr_written_zero)                                \
-       x(never_write)                                                  \
-       x(pinned)
-
-enum btree_flags {
-       /* First bits for btree node write type */
-       BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
-#define x(flag)        BTREE_NODE_##flag,
-       BTREE_FLAGS()
-#undef x
-};
-
-#define x(flag)                                                                \
-static inline bool btree_node_ ## flag(struct btree *b)                        \
-{      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
-                                                                       \
-static inline void set_btree_node_ ## flag(struct btree *b)            \
-{      set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
-                                                                       \
-static inline void clear_btree_node_ ## flag(struct btree *b)          \
-{      clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-BTREE_FLAGS()
-#undef x
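For readers new to the x-macro idiom: each entry in BTREE_FLAGS() stamps out a
test/set/clear trio of accessors. Written out by hand, the expansion for the
dirty flag is:

static inline bool btree_node_dirty(struct btree *b)
{       return test_bit(BTREE_NODE_dirty, &b->flags); }

static inline void set_btree_node_dirty(struct btree *b)
{       set_bit(BTREE_NODE_dirty, &b->flags); }

static inline void clear_btree_node_dirty(struct btree *b)
{       clear_bit(BTREE_NODE_dirty, &b->flags); }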
-
-#define BTREE_NODE_REWRITE_REASON()                                    \
-       x(none)                                                         \
-       x(unknown)                                                      \
-       x(error)                                                        \
-       x(degraded)                                                     \
-       x(ptr_written_zero)
-
-enum btree_node_rewrite_reason {
-#define x(n)   BTREE_NODE_REWRITE_##n,
-       BTREE_NODE_REWRITE_REASON()
-#undef x
-};
-
-static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
-{
-       if (btree_node_need_rewrite_ptr_written_zero(b))
-               return BTREE_NODE_REWRITE_ptr_written_zero;
-       if (btree_node_need_rewrite_degraded(b))
-               return BTREE_NODE_REWRITE_degraded;
-       if (btree_node_need_rewrite_error(b))
-               return BTREE_NODE_REWRITE_error;
-       if (btree_node_need_rewrite(b))
-               return BTREE_NODE_REWRITE_unknown;
-       return BTREE_NODE_REWRITE_none;
-}
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
-       return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
-       return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
-       EBUG_ON(!b->nsets);
-       return b->set + b->nsets - 1;
-}
-
-static inline void *
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
-{
-       return (void *) ((u64 *) b->data + offset);
-}
-
-static inline u16
-__btree_node_ptr_to_offset(const struct btree *b, const void *p)
-{
-       u16 ret = (u64 *) p - (u64 *) b->data;
-
-       EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
-       return ret;
-}
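Note that offsets here are counted in u64s from the start of the node's data,
not in bytes; that is what lets them fit in a u16. A 256k btree node, for
example, holds at most 262144 / 8 = 32768 u64s, comfortably under U16_MAX.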
-
-static inline struct bset *bset(const struct btree *b,
-                               const struct bset_tree *t)
-{
-       return __btree_node_offset_to_ptr(b, t->data_offset);
-}
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
-       t->end_offset =
-               __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
-                                 const struct bset *i)
-{
-       t->data_offset = __btree_node_ptr_to_offset(b, i);
-       set_btree_bset_end(b, t);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
-       return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
-       return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
-       return __btree_node_ptr_to_offset(b, k);
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
-       return __btree_node_offset_to_ptr(b, k);
-}
-
-static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
-{
-       return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
-}
-
-#define btree_bkey_first(_b, _t)                                       \
-({                                                                     \
-       EBUG_ON(bset(_b, _t)->start !=                                  \
-               __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
-                                                                       \
-       bset(_b, _t)->start;                                            \
-})
-
-#define btree_bkey_last(_b, _t)                                                \
-({                                                                     \
-       EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
-               vstruct_last(bset(_b, _t)));                            \
-                                                                       \
-       __btree_node_offset_to_key(_b, (_t)->end_offset);               \
-})
-
-static inline unsigned bset_u64s(struct bset_tree *t)
-{
-       return t->end_offset - t->data_offset -
-               sizeof(struct bset) / sizeof(u64);
-}
-
-static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
-{
-       return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
-       return i - (void *) b->data;
-}
-
-enum btree_node_type {
-       BKEY_TYPE_btree,
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
-       BCH_BTREE_IDS()
-#undef x
-       BKEY_TYPE_NR
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
-{
-       return level ? BKEY_TYPE_btree : (unsigned) id + 1;
-}
-
-/* Type of keys @b contains: */
-static inline enum btree_node_type btree_node_type(struct btree *b)
-{
-       return __btree_node_type(b->c.level, b->c.btree_id);
-}
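In other words, leaf (level 0) nodes of, say, BTREE_ID_extents hold
BKEY_TYPE_extents keys (the btree ID shifted up by one so BKEY_TYPE_btree can
occupy slot 0), while interior nodes of every btree hold BKEY_TYPE_btree
pointer keys.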
-
-const char *bch2_btree_node_type_str(enum btree_node_type);
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
-       (BIT_ULL(BKEY_TYPE_extents)|                    \
-        BIT_ULL(BKEY_TYPE_alloc)|                      \
-        BIT_ULL(BKEY_TYPE_inodes)|                     \
-        BIT_ULL(BKEY_TYPE_stripes)|                    \
-        BIT_ULL(BKEY_TYPE_reflink)|                    \
-        BIT_ULL(BKEY_TYPE_subvolumes)|                 \
-        BIT_ULL(BKEY_TYPE_btree))
-
-#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS            \
-       (BIT_ULL(BKEY_TYPE_alloc)|                      \
-        BIT_ULL(BKEY_TYPE_inodes)|                     \
-        BIT_ULL(BKEY_TYPE_stripes)|                    \
-        BIT_ULL(BKEY_TYPE_snapshots))
-
-#define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
-       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
-        BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-
-static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
-{
-       return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
-{
-       return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_triggers(enum btree_node_type type)
-{
-       return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
-}
-
-static inline bool btree_id_is_extents(enum btree_id btree)
-{
-       const u64 mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_IS_extents)) << nr)
-       BCH_BTREE_IDS()
-#undef x
-       ;
-
-       return BIT_ULL(btree) & mask;
-}
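The mask is built entirely at compile time by the same x-macro trick. With two
illustrative entries x(extents, 0, BTREE_IS_extents) and x(inodes, 1, 0)
(argument values assumed for the example), the expression expands to:

const u64 mask = 0
        | ((!!((BTREE_IS_extents) & BTREE_IS_extents)) << 0)    /* 1 << 0 */
        | ((!!((0)                & BTREE_IS_extents)) << 1);   /* 0 << 1 */
/* == 0x1: only bit 0 (BTREE_ID_extents) is set */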
-
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
-       return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
-}
-
-static inline bool btree_type_has_snapshots(enum btree_id btree)
-{
-       const u64 mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_IS_snapshots)) << nr)
-       BCH_BTREE_IDS()
-#undef x
-       ;
-
-       return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_snapshot_field(enum btree_id btree)
-{
-       const u64 mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
-       BCH_BTREE_IDS()
-#undef x
-       ;
-
-       return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_ptrs(enum btree_id btree)
-{
-       const u64 mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_IS_data)) << nr)
-       BCH_BTREE_IDS()
-#undef x
-       ;
-
-       return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_uses_write_buffer(enum btree_id btree)
-{
-       const u64 mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_IS_write_buffer)) << nr)
-       BCH_BTREE_IDS()
-#undef x
-       ;
-
-       return BIT_ULL(btree) & mask;
-}
-
-static inline u8 btree_trigger_order(enum btree_id btree)
-{
-       switch (btree) {
-       case BTREE_ID_alloc:
-               return U8_MAX;
-       case BTREE_ID_stripes:
-               return U8_MAX - 1;
-       default:
-               return btree;
-       }
-}
-
-struct btree_root {
-       struct btree            *b;
-
-       /* On disk root - see async splits: */
-       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-       u8                      level;
-       u8                      alive;
-       s16                     error;
-};
-
-enum btree_gc_coalesce_fail_reason {
-       BTREE_GC_COALESCE_FAIL_RESERVE_GET,
-       BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
-       BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-enum btree_node_sibling {
-       btree_prev_sib,
-       btree_next_sib,
-};
-
-struct get_locks_fail {
-       unsigned        l;
-       struct btree    *b;
-};
-
-#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
deleted file mode 100644 (file)
index ee657b9..0000000
+++ /dev/null
@@ -1,916 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extents.h"
-#include "keylist.h"
-#include "snapshot.h"
-#include "trace.h"
-
-#include <linux/string_helpers.h>
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
-                                        const struct btree_insert_entry *r)
-{
-       return   cmp_int(l->sort_order, r->sort_order) ?:
-                cmp_int(l->cached,     r->cached) ?:
-                -cmp_int(l->level,     r->level) ?:
-                bpos_cmp(l->k->k.p,    r->k->k.p);
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
-                         struct bkey_i *, enum btree_iter_update_trigger_flags,
-                         unsigned long ip);
-
-static noinline int extent_front_merge(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      struct bkey_s_c k,
-                                      struct bkey_i **insert,
-                                      enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *update;
-       int ret;
-
-       if (unlikely(trans->journal_replay_not_finished))
-               return 0;
-
-       update = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(update);
-       if (ret)
-               return ret;
-
-       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
-               return 0;
-
-       ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
-               bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       ret = bch2_btree_delete_at(trans, iter, flags);
-       if (ret)
-               return ret;
-
-       *insert = update;
-       return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
-                                     struct btree_iter *iter,
-                                     struct bkey_i *insert,
-                                     struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       if (unlikely(trans->journal_replay_not_finished))
-               return 0;
-
-       ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
-               bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
-       return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-                                     enum btree_id btree_id, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u32 snapshot = pos.snapshot;
-       int ret;
-
-       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
-               return 0;
-
-       pos.snapshot++;
-
-       for_each_btree_key_norestart(trans, iter, btree_id, pos,
-                          BTREE_ITER_all_snapshots|
-                          BTREE_ITER_nopreserve, k, ret) {
-               if (!bkey_eq(k.k->p, pos))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
-                                             k.k->p.snapshot)) {
-                       ret = !bkey_whiteout(k.k);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-                                    enum btree_id btree, struct bpos pos,
-                                    snapshot_id_list *s)
-{
-       int ret = 0;
-
-       darray_for_each(*s, id) {
-               pos.snapshot = *id;
-
-               struct btree_iter iter;
-               struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
-                                                      BTREE_ITER_not_extents|
-                                                      BTREE_ITER_intent);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               if (k.k->type == KEY_TYPE_deleted) {
-                       struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-                       ret = PTR_ERR_OR_ZERO(update);
-                       if (ret) {
-                               bch2_trans_iter_exit(trans, &iter);
-                               break;
-                       }
-
-                       bkey_init(&update->k);
-                       update->k.p             = pos;
-                       update->k.type          = KEY_TYPE_whiteout;
-
-                       ret = bch2_trans_update(trans, &iter, update,
-                                               BTREE_UPDATE_internal_snapshot_node);
-               }
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (ret)
-                       break;
-       }
-
-       darray_exit(s);
-       return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      enum btree_iter_update_trigger_flags flags,
-                                      struct bkey_s_c old,
-                                      struct bkey_s_c new)
-{
-       enum btree_id btree_id = iter->btree_id;
-       struct bkey_i *update;
-       struct bpos new_start = bkey_start_pos(new.k);
-       unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
-       unsigned back_split  = bkey_gt(old.k->p, new.k->p);
-       unsigned middle_split = (front_split || back_split) &&
-               old.k->p.snapshot != new.k->p.snapshot;
-       unsigned nr_splits = front_split + back_split + middle_split;
-       int ret = 0, compressed_sectors;
-
-       /*
-        * If we're going to be splitting a compressed extent, note it
-        * so that __bch2_trans_commit() can increase our disk
-        * reservation:
-        */
-       if (nr_splits > 1 &&
-           (compressed_sectors = bch2_bkey_sectors_compressed(old)))
-               trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
-
-       if (front_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_back(new_start, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                       BTREE_UPDATE_internal_snapshot_node|flags);
-               if (ret)
-                       return ret;
-       }
-
-       /* If we're overwriting in a different snapshot - middle split: */
-       if (middle_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new_start, update);
-               bch2_cut_back(new.k->p, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_internal_snapshot_node|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (bkey_le(old.k->p, new.k->p)) {
-               update = bch2_trans_kmalloc(trans, sizeof(*update));
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bkey_init(&update->k);
-               update->k.p = old.k->p;
-               update->k.p.snapshot = new.k->p.snapshot;
-
-               if (new.k->p.snapshot != old.k->p.snapshot) {
-                       update->k.type = KEY_TYPE_whiteout;
-               } else if (btree_type_has_snapshots(btree_id)) {
-                       ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
-                       if (ret < 0)
-                               return ret;
-                       if (ret)
-                               update->k.type = KEY_TYPE_whiteout;
-               }
-
-               ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_internal_snapshot_node|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (back_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new.k->p, update);
-
-               ret = bch2_trans_update_by_path(trans, iter->path, update,
-                                         BTREE_UPDATE_internal_snapshot_node|
-                                         flags, _RET_IP_);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
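A concrete case may help: suppose @old covers [0, 100) and @new covers
[20, 60) in the same snapshot. Then front_split produces a trimmed copy
[0, 20), back_split produces [60, 100), middle_split is 0, and nr_splits == 2;
if @old was compressed down to 8 sectors on disk, the reservation above grows
by 8 * (2 - 1) = 8 sectors, since each additional fragment may end up with its
own copy of the compressed data.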
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
-                                   struct btree_iter *orig_iter,
-                                   struct bkey_i *insert,
-                                   enum btree_iter_update_trigger_flags flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       enum btree_id btree_id = orig_iter->btree_id;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
-                            BTREE_ITER_intent|
-                            BTREE_ITER_with_updates|
-                            BTREE_ITER_not_extents);
-       k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
-       if ((ret = bkey_err(k)))
-               goto err;
-       if (!k.k)
-               goto out;
-
-       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
-               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
-                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
-                       if (ret)
-                               goto err;
-               }
-
-               goto next;
-       }
-
-       while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-               bool done = bkey_lt(insert->k.p, k.k->p);
-
-               ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-               if (ret)
-                       goto err;
-
-               if (done)
-                       goto out;
-next:
-               bch2_btree_iter_advance(trans, &iter);
-               k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       goto err;
-               if (!k.k)
-                       goto out;
-       }
-
-       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
-               ret = extent_back_merge(trans, &iter, insert, k);
-               if (ret)
-                       goto err;
-       }
-out:
-       if (!bkey_deleted(&insert->k))
-               ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
-                                           struct btree_insert_entry *i,
-                                           enum btree_iter_update_trigger_flags flags,
-                                           unsigned long ip)
-{
-       struct bkey k;
-       int ret;
-
-       btree_path_idx_t path_idx =
-               bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
-                             BTREE_ITER_intent, _THIS_IP_);
-       ret = bch2_btree_path_traverse(trans, path_idx, 0);
-       if (ret)
-               goto out;
-
-       struct btree_path *btree_path = trans->paths + path_idx;
-
-       /*
-        * The old key in the insert entry might actually refer to an existing
-        * key in the btree that has been deleted from cache and not yet
-        * flushed. Check for this and skip the flush so we don't run triggers
-        * against a stale key.
-        */
-       bch2_btree_path_peek_slot_exact(btree_path, &k);
-       if (!bkey_deleted(&k))
-               goto out;
-
-       i->key_cache_already_flushed = true;
-       i->flags |= BTREE_TRIGGER_norun;
-
-       btree_path_set_should_be_locked(trans, btree_path);
-       ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
-out:
-       bch2_path_put(trans, path_idx, true);
-       return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
-                         struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
-                         unsigned long ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i, n;
-       int cmp;
-
-       struct btree_path *path = trans->paths + path_idx;
-       EBUG_ON(!path->should_be_locked);
-       EBUG_ON(trans->nr_updates >= trans->nr_paths);
-       EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
-       n = (struct btree_insert_entry) {
-               .flags          = flags,
-               .sort_order     = btree_trigger_order(path->btree_id),
-               .bkey_type      = __btree_node_type(path->level, path->btree_id),
-               .btree_id       = path->btree_id,
-               .level          = path->level,
-               .cached         = path->cached,
-               .path           = path_idx,
-               .k              = k,
-               .ip_allocated   = ip,
-       };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(i != trans->updates &&
-                      btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
-       /*
-        * Pending updates are kept sorted: first, find position of new update,
-        * then delete/trim any updates the new update overwrites:
-        */
-       for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
-               cmp = btree_insert_entry_cmp(&n, i);
-               if (cmp <= 0)
-                       break;
-       }
-
-       bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
-
-       if (overwrite) {
-               EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
-               bch2_path_put(trans, i->path, true);
-               i->flags        = n.flags;
-               i->cached       = n.cached;
-               i->k            = n.k;
-               i->path         = n.path;
-               i->ip_allocated = n.ip_allocated;
-       } else {
-               array_insert_item(trans->updates, trans->nr_updates,
-                                 i - trans->updates, n);
-
-               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
-               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
-               if (unlikely(trans->journal_replay_not_finished)) {
-                       struct bkey_i *j_k =
-                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
-                       if (j_k) {
-                               i->old_k = j_k->k;
-                               i->old_v = &j_k->v;
-                       }
-               }
-       }
-
-       __btree_path_get(trans, trans->paths + i->path, true);
-
-       trace_update_by_path(trans, path, i, overwrite);
-
-       /*
-        * If a key is present in the key cache, it must also exist in the
-        * btree - this is necessary for cache coherency. When iterating over
-        * a btree that's cached in the key cache, the btree iter code checks
-        * the key cache - but the key has to exist in the btree for that to
-        * work:
-        */
-       if (path->cached && !i->old_btree_u64s)
-               return flush_new_cached_update(trans, i, flags, ip);
-
-       return 0;
-}
-
-static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
-                                                   struct btree_iter *iter,
-                                                   struct btree_path *path)
-{
-       struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
-
-       if (!key_cache_path ||
-           !key_cache_path->should_be_locked ||
-           !bpos_eq(key_cache_path->pos, iter->pos)) {
-               struct bkey_cached *ck;
-               int ret;
-
-               if (!iter->key_cache_path)
-                       iter->key_cache_path =
-                               bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                             BTREE_ITER_intent|
-                                             BTREE_ITER_cached, _THIS_IP_);
-
-               iter->key_cache_path =
-                       bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-                                               iter->flags & BTREE_ITER_intent,
-                                               _THIS_IP_);
-
-               ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
-               if (unlikely(ret))
-                       return ret;
-
-               ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
-
-               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                       trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-               }
-
-               btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-       }
-
-       return 0;
-}
-
-int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
-                                     struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
-                                     unsigned long ip)
-{
-       kmsan_check_memory(k, bkey_bytes(&k->k));
-
-       btree_path_idx_t path_idx = iter->update_path ?: iter->path;
-       int ret;
-
-       if (iter->flags & BTREE_ITER_is_extents)
-               return bch2_trans_update_extent(trans, iter, k, flags);
-
-       if (bkey_deleted(&k->k) &&
-           !(flags & BTREE_UPDATE_key_cache_reclaim) &&
-           (iter->flags & BTREE_ITER_filter_snapshots)) {
-               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
-               if (unlikely(ret < 0))
-                       return ret;
-
-               if (ret)
-                       k->k.type = KEY_TYPE_whiteout;
-       }
-
-       /*
-        * Ensure that updates to cached btrees go to the key cache:
-        */
-       struct btree_path *path = trans->paths + path_idx;
-       if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
-           !path->cached &&
-           !path->level &&
-           btree_id_cached(trans->c, path->btree_id)) {
-               ret = bch2_trans_update_get_key_cache(trans, iter, path);
-               if (ret)
-                       return ret;
-
-               path_idx = iter->key_cache_path;
-       }
-
-       return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *trans,
-                                 enum btree_id btree,
-                                 struct bkey_i *k)
-{
-       struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
-       int ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       bkey_copy(n, k);
-       return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
-                               struct btree_trans_subbuf *buf,
-                               unsigned u64s)
-{
-       unsigned new_top = buf->u64s + u64s;
-       unsigned new_size = buf->size;
-
-       BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
-
-       if (new_top > new_size)
-               new_size = roundup_pow_of_two(new_top);
-
-       void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64));
-       if (IS_ERR(n))
-               return n;
-
-       unsigned offset = (u64 *) n - (u64 *) trans->mem;
-       BUG_ON(offset > U16_MAX);
-
-       if (buf->u64s)
-               memcpy(n,
-                      btree_trans_subbuf_base(trans, buf),
-                      buf->size * sizeof(u64));
-       buf->base = offset;
-       buf->size = new_size;
-
-       void *p = btree_trans_subbuf_top(trans, buf);
-       buf->u64s = new_top;
-       return p;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
-                            enum btree_id btree, struct bpos end)
-{
-       bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
-       struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       bch2_btree_iter_advance(trans, iter);
-       k = bch2_btree_iter_peek_slot(trans, iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       BUG_ON(k.k->type != KEY_TYPE_deleted);
-
-       if (bkey_gt(k.k->p, end)) {
-               ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
-               goto err;
-       }
-
-       return 0;
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
-                           struct btree_trans_commit_hook *h)
-{
-       h->next = trans->hooks;
-       trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
-                               enum btree_id btree, struct bkey_i *k,
-                               enum btree_iter_update_trigger_flags flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
-                            BTREE_ITER_cached|
-                            BTREE_ITER_not_extents|
-                            BTREE_ITER_intent);
-       ret   = bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
-                           struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
-       struct btree_iter iter;
-       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
-                            BTREE_ITER_intent|flags);
-       int ret = bch2_btree_iter_traverse(trans, &iter) ?:
-                 bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a given btree
- * @c:                 pointer to struct bch_fs
- * @id:                        btree to insert into
- * @k:                 key to insert
- * @disk_res:          must be non-NULL whenever inserting or potentially
- *                     splitting data extents
- * @flags:             transaction commit flags
- * @iter_flags:                btree iter update trigger flags
- *
- * Returns:            0 on success, error code on failure
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
-                     struct disk_reservation *disk_res, int flags,
-                     enum btree_iter_update_trigger_flags iter_flags)
-{
-       return bch2_trans_commit_do(c, disk_res, NULL, flags,
-                            bch2_btree_insert_trans(trans, id, k, iter_flags));
-}
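As a usage sketch (the key construction is hypothetical), inserting a single
valueless key with no transaction already in hand might look like:

static int insert_example(struct bch_fs *c, struct bpos pos)
{
        struct bkey_i k;

        bkey_init(&k.k);
        k.k.type = KEY_TYPE_set;        /* assumed: no value body needed */
        k.k.p = pos;

        /* NULL disk_res is fine here: not a data extent */
        return bch2_btree_insert(c, BTREE_ID_xattrs, &k, NULL, 0, 0);
}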
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned update_flags)
-{
-       struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
-       int ret = PTR_ERR_OR_ZERO(k);
-       if (ret)
-               return ret;
-
-       bkey_init(&k->k);
-       k->k.p = iter->pos;
-       return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete(struct btree_trans *trans,
-                     enum btree_id btree, struct bpos pos,
-                     unsigned update_flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, btree, pos,
-                            BTREE_ITER_cached|
-                            BTREE_ITER_intent);
-       ret   = bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_btree_delete_at(trans, &iter, update_flags);
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
-                                 struct bpos start, struct bpos end,
-                                 unsigned update_flags,
-                                 u64 *journal_seq)
-{
-       u32 restart_count = trans->restart_count;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
-       while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(trans->c, 0);
-               struct bkey_i delete;
-
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               bkey_init(&delete.k);
-
-               /*
-                * This could probably be more efficient for extents:
-                */
-
-               /*
-                * For extents, iter.pos won't necessarily be the same as
-                * bkey_start_pos(k.k) (for non extents they always will be the
-                * same). It's important that we delete starting from iter.pos
-                * because the range we want to delete could start in the middle
-                * of k.
-                *
-                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
-                * bkey_start_pos(k.k)).
-                */
-               delete.k.p = iter.pos;
-
-               if (iter.flags & BTREE_ITER_is_extents)
-                       bch2_key_resize(&delete.k,
-                                       bpos_min(end, k.k->p).offset -
-                                       iter.pos.offset);
-
-               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
-                       bch2_trans_commit(trans, &disk_res, journal_seq,
-                                         BCH_TRANS_COMMIT_no_enospc);
-               bch2_disk_reservation_put(trans->c, &disk_res);
-err:
-               /*
-                * the bch2_trans_begin() call is in a weird place because we
-                * need to call it after every transaction commit, to avoid path
-                * overflow, but don't want to call it if the delete operation
-                * is a no-op and we have no work to do:
-                */
-               bch2_trans_begin(trans);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
-                           struct bpos start, struct bpos end,
-                           unsigned update_flags,
-                           u64 *journal_seq)
-{
-       int ret = bch2_trans_run(c,
-                       bch2_btree_delete_range_trans(trans, id, start, end,
-                                                     update_flags, journal_seq));
-       if (ret == -BCH_ERR_transaction_restart_nested)
-               ret = 0;
-       return ret;
-}
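Because the interval is half open, a sketch that drops every key for one inode
would pass end = POS(inum + 1, 0); a key at exactly that position is untouched
(the btree choice here is illustrative):

static int drop_inode_keys(struct bch_fs *c, u64 inum)
{
        return bch2_btree_delete_range(c, BTREE_ID_xattrs,
                                       POS(inum, 0), POS(inum + 1, 0),
                                       0, NULL);
}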
-
-int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
-{
-       struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
-       int ret = PTR_ERR_OR_ZERO(k);
-       if (ret)
-               return ret;
-
-       bkey_init(&k->k);
-       k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-       k->k.p = iter->pos;
-       if (iter->flags & BTREE_ITER_is_extents)
-               bch2_key_resize(&k->k, 1);
-
-       return bch2_trans_update(trans, iter, k, 0);
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
-                      struct bpos pos, bool set)
-{
-       struct btree_iter iter;
-       bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-
-       int ret = bch2_btree_iter_traverse(trans, &iter) ?:
-                 bch2_btree_bit_mod_iter(trans, &iter, set);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
-                               struct bpos pos, bool set)
-{
-       struct bkey_i k;
-
-       bkey_init(&k.k);
-       k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-       k.k.p = pos;
-
-       return bch2_trans_update_buffered(trans, btree, &k);
-}
-
-static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len)
-{
-       unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
-
-       struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
-       int ret = PTR_ERR_OR_ZERO(e);
-       if (ret)
-               return ret;
-
-       struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
-       journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
-       memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0);
-       return 0;
-}
-
-int bch2_trans_log_str(struct btree_trans *trans, const char *str)
-{
-       return __bch2_trans_log_str(trans, str, strlen(str));
-}
-
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
-{
-       int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
-       if (ret)
-               return ret;
-
-       return __bch2_trans_log_str(trans, buf->buf, buf->pos);
-}
-
-int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
-                       unsigned level, struct bkey_i *k)
-{
-       struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
-       int ret = PTR_ERR_OR_ZERO(e);
-       if (ret)
-               return ret;
-
-       journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s);
-       bkey_copy(e->start, k);
-       return 0;
-}
-
-__printf(3, 0)
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
-                 va_list args)
-{
-       struct printbuf buf = PRINTBUF;
-       prt_vprintf(&buf, fmt, args);
-
-       unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
-       int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
-       if (ret)
-               goto err;
-
-       if (!test_bit(JOURNAL_running, &c->journal.flags)) {
-               ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
-               if (ret)
-                       goto err;
-
-               struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
-               journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
-               memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
-               c->journal.early_journal_entries.nr += jset_u64s(u64s);
-       } else {
-               ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
-                       bch2_trans_log_msg(trans, &buf));
-       }
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-__printf(2, 3)
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, 0, fmt, args);
-       va_end(args);
-       return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-__printf(2, 3)
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
-       va_end(args);
-       return ret;
-}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
deleted file mode 100644 (file)
index 0b98ab9..0000000
+++ /dev/null
@@ -1,429 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_H
-#define _BCACHEFS_BTREE_UPDATE_H
-
-#include "btree_iter.h"
-#include "journal.h"
-#include "snapshot.h"
-
-struct bch_fs;
-struct btree;
-
-void bch2_btree_node_prep_for_write(struct btree_trans *,
-                                   struct btree_path *, struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
-                               struct btree *, struct btree_node_iter *,
-                               struct bkey_i *);
-
-int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
-int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
-void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
-
-void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
-                               struct bkey_i *, u64);
-
-#define BCH_TRANS_COMMIT_FLAGS()                                                       \
-       x(no_enospc,    "don't check for enospc")                                       \
-       x(no_check_rw,  "don't attempt to take a ref on c->writes")                     \
-       x(no_journal_res, "don't take a journal reservation, instead "                  \
-                       "pin journal entry referred to by trans->journal_res.seq")      \
-       x(journal_reclaim, "operation required for journal reclaim; may return error "  \
-                       "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
-       x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
-
-enum __bch_trans_commit_flags {
-       /* First bits for bch_watermark: */
-       __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
-#define x(n, ...)      __BCH_TRANS_COMMIT_##n,
-       BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-enum bch_trans_commit_flags {
-#define x(n, ...)      BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
-       BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
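The two enums are generated in two passes over the same list: the first
assigns each flag a bit index after the watermark bits, the second turns that
index into a mask. For no_enospc, and assuming BCH_WATERMARK_BITS is 3, this
expands to:

enum {
        __BCH_TRANS_COMMIT_FLAGS_START  = 3,    /* BCH_WATERMARK_BITS (assumed) */
        __BCH_TRANS_COMMIT_no_enospc,           /* == 4 */
        /* ... */
};
enum {
        BCH_TRANS_COMMIT_no_enospc      = BIT(4),       /* 1 << 4 */
        /* ... */
};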
-
-void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-
-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-
-int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
-                               struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
-                       enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
-               disk_reservation *, int flags, enum
-               btree_iter_update_trigger_flags iter_flags);
-
-int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
-                                 struct bpos, struct bpos, unsigned, u64 *);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
-                           struct bpos, struct bpos, unsigned, u64 *);
-
-int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
-int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
-int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
-
-static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
-                                               enum btree_id btree, struct bpos pos)
-{
-       return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
-                                    struct bpos, snapshot_id_list *);
-
-/*
- * For use when splitting extents in existing snapshots:
- *
- * If @old_pos is an interior snapshot node, iterate over descendant snapshot
- * nodes: for every descendant snapshot in which @old_pos is overwritten and
- * not visible, emit a whiteout at @new_pos.
- */
-static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-                                                enum btree_id btree,
-                                                struct bpos old_pos,
-                                                struct bpos new_pos)
-{
-       BUG_ON(old_pos.snapshot != new_pos.snapshot);
-
-       if (!btree_type_has_snapshots(btree) ||
-           bkey_eq(old_pos, new_pos))
-               return 0;
-
-       snapshot_id_list s;
-       int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
-       if (ret)
-               return ret;
-
-       return s.nr
-               ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
-               : 0;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
-                                      enum btree_iter_update_trigger_flags,
-                                      struct bkey_s_c, struct bkey_s_c);
-
-int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
-                            enum btree_id, struct bpos);
-
-int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
-                                     struct bkey_i *, enum btree_iter_update_trigger_flags,
-                                     unsigned long);
-
-static inline int __must_check
-bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
-                 struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
-       return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
-}
-
-static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
-                                           struct btree_trans_subbuf *buf)
-{
-       return (u64 *) trans->mem + buf->base;
-}
-
-static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
-                                          struct btree_trans_subbuf *buf)
-{
-       return (u64 *) trans->mem + buf->base + buf->u64s;
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *,
-                               struct btree_trans_subbuf *,
-                               unsigned);
-
-static inline void *
-bch2_trans_subbuf_alloc(struct btree_trans *trans,
-                       struct btree_trans_subbuf *buf,
-                       unsigned u64s)
-{
-       if (buf->u64s + u64s > buf->size)
-               return __bch2_trans_subbuf_alloc(trans, buf, u64s);
-
-       void *p = btree_trans_subbuf_top(trans, buf);
-       buf->u64s += u64s;
-       return p;
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
-{
-       return btree_trans_subbuf_base(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
-{
-       return btree_trans_subbuf_top(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *
-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
-       return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *);
-
-static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
-                                           enum btree_id btree,
-                                           struct bkey_i *k)
-{
-       kmsan_check_memory(k, bkey_bytes(&k->k));
-
-       EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
-       if (unlikely(!btree_type_uses_write_buffer(btree))) {
-               int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k);
-               dump_stack();
-               return ret;
-       }
-       /*
-        * Most updates skip the btree write buffer until journal replay is
-        * finished because synchronization with journal replay relies on having
-        * a btree node locked - if we're overwriting a key in the journal that
-        * journal replay hasn't yet replayed, we have to mark it as
-        * overwritten.
-        *
-        * But accounting updates don't overwrite, they're deltas, and they have
-        * to be flushed to the btree strictly in order for journal replay to be
-        * able to tell which updates need to be applied:
-        */
-       if (k->k.type != KEY_TYPE_accounting &&
-           unlikely(trans->journal_replay_not_finished))
-               return bch2_btree_insert_clone_trans(trans, btree, k);
-
-       struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
-       int ret = PTR_ERR_OR_ZERO(e);
-       if (ret)
-               return ret;
-
-       journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
-       bkey_copy(e->start, k);
-       return 0;
-}
-
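-/*
- * Illustration (added commentary, not part of the original header): a
- * write-buffer update is queued inside the transaction and flushed to the
- * btree asynchronously.  BTREE_ID_lru is assumed here purely for the
- * example; any btree for which btree_type_uses_write_buffer() holds works:
- *
- *	int ret = bch2_trans_update_buffered(trans, BTREE_ID_lru, k);
- *	if (ret)
- *		return ret;
- */
-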
-void bch2_trans_commit_hook(struct btree_trans *,
-                           struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *, unsigned);
-
-int bch2_trans_log_str(struct btree_trans *, const char *);
-int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
-int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
-
-__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
-
-/**
- * bch2_trans_commit - insert keys at given iterator positions
- * @trans:		transaction to commit
- * @disk_res:		disk reservation to charge, or NULL
- * @journal_seq:	if non-NULL, set to the journal sequence number the
- *			commit was written at
- * @flags:		BCH_TRANS_COMMIT_* and watermark flags
- *
- * This is the main entry point for btree updates.
- *
- * Return values:
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static inline int bch2_trans_commit(struct btree_trans *trans,
-                                   struct disk_reservation *disk_res,
-                                   u64 *journal_seq,
-                                   unsigned flags)
-{
-       trans->disk_res         = disk_res;
-       trans->journal_seq      = journal_seq;
-
-       return __bch2_trans_commit(trans, flags);
-}
-
-#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)        \
-       lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_flags)))
-
-#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
-       nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
-                                       (_journal_seq), (_flags)))
-
-#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do)         \
-       bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
-
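-/*
- * Example usage (editorial, not from the original source): commit_do() and
- * bch2_trans_commit_do() wrap an update expression in lockrestart_do(), so a
- * transaction restart retries both the update and the commit.  With a
- * hypothetical helper my_update(trans) returning 0 or an error:
- *
- *	int ret = bch2_trans_commit_do(c, NULL, NULL,
- *				       BCH_TRANS_COMMIT_no_enospc,
- *				       my_update(trans));
- */
-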
-#define trans_for_each_update(_trans, _i)                              \
-       for (struct btree_insert_entry *_i = (_trans)->updates;         \
-            (_i) < (_trans)->updates + (_trans)->nr_updates;           \
-            (_i)++)
-
-static inline void bch2_trans_reset_updates(struct btree_trans *trans)
-{
-       trans_for_each_update(trans, i)
-               bch2_path_put(trans, i->path, true);
-
-       trans->nr_updates               = 0;
-       trans->journal_entries.u64s     = 0;
-       trans->journal_entries.size     = 0;
-       trans->accounting.u64s          = 0;
-       trans->accounting.size          = 0;
-       trans->hooks                    = NULL;
-       trans->extra_disk_res           = 0;
-}
-
-static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
-                                                 unsigned type, unsigned min_bytes)
-{
-       unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
-       struct bkey_i *mut;
-
-       if (type && k.k->type != type)
-               return ERR_PTR(-ENOENT);
-
-       /* extra padding for varint_decode_fast... */
-       mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
-       if (!IS_ERR(mut)) {
-               bkey_reassemble(mut, k);
-
-               if (unlikely(bytes > bkey_bytes(k.k))) {
-                       memset((void *) mut + bkey_bytes(k.k), 0,
-                              bytes - bkey_bytes(k.k));
-                       mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
-               }
-       }
-       return mut;
-}
-
-static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
-{
-       return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
-}
-
-#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type)           \
-       bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k,     \
-                               KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
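-/*
- * Usage sketch (added note, not original): the _typed variant verifies the
- * key type and returns a typed mutable copy; KEY_TYPE_reflink_v is assumed
- * here for illustration:
- *
- *	struct bkey_i_reflink_v *r =
- *		bch2_bkey_make_mut_noupdate_typed(trans, k, reflink_v);
- *	if (IS_ERR(r))
- *		return PTR_ERR(r);
- */
-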
-static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
-                                       struct bkey_s_c *k,
-                                       enum btree_iter_update_trigger_flags flags,
-                                       unsigned type, unsigned min_bytes)
-{
-       struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
-       int ret;
-
-       if (IS_ERR(mut))
-               return mut;
-
-       ret = bch2_trans_update(trans, iter, mut, flags);
-       if (ret)
-               return ERR_PTR(ret);
-
-       *k = bkey_i_to_s_c(mut);
-       return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
-                                               struct btree_iter *iter, struct bkey_s_c *k,
-                                               enum btree_iter_update_trigger_flags flags)
-{
-       return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
-}
-
-#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type)     \
-       bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
-                               KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
-                                        struct btree_iter *iter,
-                                        unsigned btree_id, struct bpos pos,
-                                        enum btree_iter_update_trigger_flags flags,
-                                        unsigned type, unsigned min_bytes)
-{
-       struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
-                               btree_id, pos, flags|BTREE_ITER_intent, type);
-       struct bkey_i *ret = IS_ERR(k.k)
-               ? ERR_CAST(k.k)
-               : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
-       if (IS_ERR(ret))
-               bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
-                                              struct btree_iter *iter,
-                                              unsigned btree_id, struct bpos pos,
-                                              enum btree_iter_update_trigger_flags flags)
-{
-       return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
-                                        struct btree_iter *iter,
-                                        unsigned btree_id, struct bpos pos,
-                                        enum btree_iter_update_trigger_flags flags,
-                                        unsigned type, unsigned min_bytes)
-{
-       struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
-                               btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
-       int ret;
-
-       if (IS_ERR(mut))
-               return mut;
-
-       ret = bch2_trans_update(trans, iter, mut, flags);
-       if (ret) {
-               bch2_trans_iter_exit(trans, iter);
-               return ERR_PTR(ret);
-       }
-
-       return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
-                                                      struct btree_iter *iter,
-                                                      unsigned btree_id, struct bpos pos,
-                                                      enum btree_iter_update_trigger_flags flags,
-                                                      unsigned min_bytes)
-{
-       return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
-                                              struct btree_iter *iter,
-                                              unsigned btree_id, struct bpos pos,
-                                              enum btree_iter_update_trigger_flags flags)
-{
-       return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
-       bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter,            \
-                       _btree_id, _pos, _flags,                        \
-                       KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
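-/*
- * Usage sketch (editorial, not from the original source): look up a key,
- * make a mutable copy and queue the update in one call.  BTREE_ID_alloc and
- * the alloc_v4 key type are assumed for illustration; the iterator must be
- * exited once the caller is done with it:
- *
- *	struct btree_iter iter;
- *	struct bkey_i_alloc_v4 *a =
- *		bch2_bkey_get_mut_typed(trans, &iter,
- *					BTREE_ID_alloc, pos, 0, alloc_v4);
- *	int ret = PTR_ERR_OR_ZERO(a);
- *	if (ret)
- *		return ret;
- *	... modify a->v, commit, then ...
- *	bch2_trans_iter_exit(trans, &iter);
- */
-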
-static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
-                                              enum btree_iter_update_trigger_flags flags,
-                                              unsigned type, unsigned val_size)
-{
-       struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
-       int ret;
-
-       if (IS_ERR(k))
-               return k;
-
-       bkey_init(&k->k);
-       k->k.p = iter->pos;
-       k->k.type = type;
-       set_bkey_val_bytes(&k->k, val_size);
-
-       ret = bch2_trans_update(trans, iter, k, flags);
-       if (unlikely(ret))
-               return ERR_PTR(ret);
-       return k;
-}
-
-#define bch2_bkey_alloc(_trans, _iter, _flags, _type)                  \
-       bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags,      \
-                               KEY_TYPE_##_type, sizeof(struct bch_##_type)))
-
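-/*
- * Usage sketch (added commentary, not original): allocate and queue a new
- * key of the given type at the iterator's position; the subvolume type is
- * assumed here for illustration:
- *
- *	struct bkey_i_subvolume *s = bch2_bkey_alloc(trans, &iter, 0, subvolume);
- *	int ret = PTR_ERR_OR_ZERO(s);
- *	if (ret)
- *		return ret;
- */
-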
-#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
deleted file mode 100644 (file)
index 553059b..0000000
+++ /dev/null
@@ -1,2854 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/random.h>
-
-static const char * const bch2_btree_update_modes[] = {
-#define x(t) #t,
-       BTREE_UPDATE_MODES()
-#undef x
-       NULL
-};
-
-static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
-
-static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-                                 btree_path_idx_t, struct btree *, struct keylist *);
-static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-/*
- * Verify that child nodes correctly span parent node's range:
- */
-int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-       struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
-               ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
-               : b->data->min_key;
-       struct btree_and_journal_iter iter;
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-       struct bkey_buf prev;
-       int ret = 0;
-
-       BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-              !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
-                       b->data->min_key));
-
-       bch2_bkey_buf_init(&prev);
-       bkey_init(&prev.k->k);
-       bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-
-       if (b == btree_node_root(c, b)) {
-               if (!bpos_eq(b->data->min_key, POS_MIN)) {
-                       bch2_log_msg_start(c, &buf);
-                       prt_printf(&buf, "btree root with incorrect min_key: ");
-                       bch2_bpos_to_text(&buf, b->data->min_key);
-                       prt_newline(&buf);
-
-                       bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
-                       goto err;
-               }
-
-               if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
-                       bch2_log_msg_start(c, &buf);
-                       prt_printf(&buf, "btree root with incorrect max_key: ");
-                       bch2_bpos_to_text(&buf, b->data->max_key);
-                       prt_newline(&buf);
-
-                       bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
-                       goto err;
-               }
-       }
-
-       if (!b->c.level)
-               goto out;
-
-       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               if (k.k->type != KEY_TYPE_btree_ptr_v2)
-                       goto out;
-
-               struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
-               struct bpos expected_min = bkey_deleted(&prev.k->k)
-                       ? node_min
-                       : bpos_successor(prev.k->k.p);
-
-               if (!bpos_eq(expected_min, bp.v->min_key)) {
-                       prt_str(&buf, "end of prev node doesn't match start of next node");
-                       prt_str(&buf, "\nprev ");
-                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
-                       prt_str(&buf, "\nnext ");
-                       bch2_bkey_val_to_text(&buf, c, k);
-                       prt_newline(&buf);
-
-                       bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
-                       goto err;
-               }
-
-               bch2_bkey_buf_reassemble(&prev, c, k);
-               bch2_btree_and_journal_iter_advance(&iter);
-       }
-
-       if (bkey_deleted(&prev.k->k)) {
-               prt_printf(&buf, "empty interior node\n");
-               bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
-               goto err;
-       }
-
-       if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
-               prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
-               prt_newline(&buf);
-
-               bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
-               goto err;
-       }
-out:
-       bch2_btree_and_journal_iter_exit(&iter);
-       bch2_bkey_buf_exit(&prev, c);
-       printbuf_exit(&buf);
-       return ret;
-err:
-       bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-       prt_char(&buf, ' ');
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-       prt_newline(&buf);
-
-       ret = __bch2_topology_error(c, &buf);
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       BUG_ON(!ret);
-       goto out;
-}
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
-       struct bkey_packed *k;
-       struct bkey uk;
-
-       for_each_bset(b, t)
-               bset_tree_for_each_key(b, t, k)
-                       if (!bkey_deleted(k)) {
-                               uk = bkey_unpack_key(b, k);
-                               bch2_bkey_format_add_key(s, &uk);
-                       }
-}
-
-static struct bkey_format bch2_btree_calc_format(struct btree *b)
-{
-       struct bkey_format_state s;
-
-       bch2_bkey_format_init(&s);
-       bch2_bkey_format_add_pos(&s, b->data->min_key);
-       bch2_bkey_format_add_pos(&s, b->data->max_key);
-       __bch2_btree_calc_format(&s, b);
-
-       return bch2_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
-                                         struct bkey_format *old_f,
-                                         struct bkey_format *new_f)
-{
-       /* stupid integer promotion rules */
-       ssize_t delta =
-           (((int) new_f->key_u64s - old_f->key_u64s) *
-            (int) nr.packed_keys) +
-           (((int) new_f->key_u64s - BKEY_U64s) *
-            (int) nr.unpacked_keys);
-
-       BUG_ON(delta + nr.live_u64s < 0);
-
-       return nr.live_u64s + delta;
-}
-
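-/*
- * Worked example (editorial, not in the original source): with
- * nr.packed_keys = 100, nr.unpacked_keys = 10, old key_u64s = 2, new
- * key_u64s = 3, and taking BKEY_U64s = 3 purely for the arithmetic:
- * delta = (3 - 2) * 100 + (3 - 3) * 10 = 100, so a node with
- * live_u64s = 500 would need 600 u64s in the new format.
- */
-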
-/**
- * bch2_btree_node_format_fits - check if we could rewrite node with a new format
- *
- * @c:         filesystem handle
- * @b:         btree node to rewrite
- * @nr:                number of keys for new node (i.e. b->nr)
- * @new_f:     bkey format to translate keys to
- *
- * Returns: true if all re-packed keys will be able to fit in a new node.
- *
- * Assumes all keys will successfully pack with the new format.
- */
-static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
-                                struct btree_nr_keys nr,
-                                struct bkey_format *new_f)
-{
-       size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
-
-       return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
-}
-
-/* Btree node freeing/allocation: */
-
-static void __btree_node_free(struct btree_trans *trans, struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-
-       trace_and_count(c, btree_node_free, trans, b);
-
-       BUG_ON(btree_node_write_blocked(b));
-       BUG_ON(btree_node_dirty(b));
-       BUG_ON(btree_node_need_write(b));
-       BUG_ON(b == btree_node_root(c, b));
-       BUG_ON(b->ob.nr);
-       BUG_ON(!list_empty(&b->write_blocked));
-       BUG_ON(b->will_make_reachable);
-
-       clear_btree_node_noevict(b);
-}
-
-static void bch2_btree_node_free_inmem(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-
-       bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
-       __btree_node_free(trans, b);
-
-       mutex_lock(&c->btree_cache.lock);
-       bch2_btree_node_hash_remove(&c->btree_cache, b);
-       mutex_unlock(&c->btree_cache.lock);
-
-       six_unlock_write(&b->c.lock);
-       mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
-       bch2_trans_node_drop(trans, b);
-}
-
-static void bch2_btree_node_free_never_used(struct btree_update *as,
-                                           struct btree_trans *trans,
-                                           struct btree *b)
-{
-       struct bch_fs *c = as->c;
-       struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
-
-       BUG_ON(!list_empty(&b->write_blocked));
-       BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
-
-       b->will_make_reachable = 0;
-       closure_put(&as->cl);
-
-       clear_btree_node_will_make_reachable(b);
-       clear_btree_node_accessed(b);
-       clear_btree_node_dirty_acct(c, b);
-       clear_btree_node_need_write(b);
-
-       mutex_lock(&c->btree_cache.lock);
-       __bch2_btree_node_hash_remove(&c->btree_cache, b);
-       mutex_unlock(&c->btree_cache.lock);
-
-       BUG_ON(p->nr >= ARRAY_SIZE(p->b));
-       p->b[p->nr++] = b;
-
-       six_unlock_intent(&b->c.lock);
-
-       bch2_trans_node_drop(trans, b);
-}
-
-static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
-                                            struct disk_reservation *res,
-                                            struct closure *cl,
-                                            bool interior_node,
-                                            unsigned target,
-                                            unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct write_point *wp;
-       struct btree *b;
-       BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-       struct open_buckets obs = { .nr = 0 };
-       struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
-               ? BTREE_NODE_RESERVE
-               : 0;
-       int ret;
-
-       b = bch2_btree_node_mem_alloc(trans, interior_node);
-       if (IS_ERR(b))
-               return b;
-
-       BUG_ON(b->ob.nr);
-
-       mutex_lock(&c->btree_reserve_cache_lock);
-       if (c->btree_reserve_cache_nr > nr_reserve) {
-               struct btree_alloc *a =
-                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-               obs = a->ob;
-               bkey_copy(&tmp.k, &a->k);
-               mutex_unlock(&c->btree_reserve_cache_lock);
-               goto out;
-       }
-       mutex_unlock(&c->btree_reserve_cache_lock);
-retry:
-       ret = bch2_alloc_sectors_start_trans(trans,
-                                     target ?:
-                                     c->opts.metadata_target ?:
-                                     c->opts.foreground_target,
-                                     0,
-                                     writepoint_ptr(&c->btree_write_point),
-                                     &devs_have,
-                                     res->nr_replicas,
-                                     min(res->nr_replicas,
-                                         c->opts.metadata_replicas_required),
-                                     watermark,
-                                     target ? BCH_WRITE_only_specified_devs : 0,
-                                     cl, &wp);
-       if (unlikely(ret))
-               goto err;
-
-       if (wp->sectors_free < btree_sectors(c)) {
-               struct open_bucket *ob;
-               unsigned i;
-
-               open_bucket_for_each(c, &wp->ptrs, ob, i)
-                       if (ob->sectors_free < btree_sectors(c))
-                               ob->sectors_free = 0;
-
-               bch2_alloc_sectors_done(c, wp);
-               goto retry;
-       }
-
-       bkey_btree_ptr_v2_init(&tmp.k);
-       bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
-
-       bch2_open_bucket_get(c, wp, &obs);
-       bch2_alloc_sectors_done(c, wp);
-out:
-       bkey_copy(&b->key, &tmp.k);
-       b->ob = obs;
-       six_unlock_write(&b->c.lock);
-       six_unlock_intent(&b->c.lock);
-
-       return b;
-err:
-       bch2_btree_node_to_freelist(c, b);
-       return ERR_PTR(ret);
-}
-
-static struct btree *bch2_btree_node_alloc(struct btree_update *as,
-                                          struct btree_trans *trans,
-                                          unsigned level)
-{
-       struct bch_fs *c = as->c;
-       struct btree *b;
-       struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
-       int ret;
-
-       BUG_ON(level >= BTREE_MAX_DEPTH);
-       BUG_ON(!p->nr);
-
-       b = p->b[--p->nr];
-
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-
-       set_btree_node_accessed(b);
-       set_btree_node_dirty_acct(c, b);
-       set_btree_node_need_write(b);
-
-       bch2_bset_init_first(b, &b->data->keys);
-       b->c.level      = level;
-       b->c.btree_id   = as->btree_id;
-       b->version_ondisk = c->sb.version;
-
-       memset(&b->nr, 0, sizeof(b->nr));
-       b->data->magic = cpu_to_le64(bset_magic(c));
-       memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
-       b->data->flags = 0;
-       SET_BTREE_NODE_ID(b->data, as->btree_id);
-       SET_BTREE_NODE_LEVEL(b->data, level);
-
-       if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-               struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
-
-               bp->v.mem_ptr           = 0;
-               bp->v.seq               = b->data->keys.seq;
-               bp->v.sectors_written   = 0;
-       }
-
-       SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
-       bch2_btree_build_aux_trees(b);
-
-       ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
-       BUG_ON(ret);
-
-       trace_and_count(c, btree_node_alloc, trans, b);
-       bch2_increment_clock(c, btree_sectors(c), WRITE);
-       return b;
-}
-
-static void btree_set_min(struct btree *b, struct bpos pos)
-{
-       if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
-               bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
-       b->data->min_key = pos;
-}
-
-static void btree_set_max(struct btree *b, struct bpos pos)
-{
-       b->key.k.p = pos;
-       b->data->max_key = pos;
-}
-
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
-                                                      struct btree_trans *trans,
-                                                      struct btree *b)
-{
-       struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
-       struct bkey_format format = bch2_btree_calc_format(b);
-
-       /*
-        * The keys might expand with the new format - if they wouldn't fit in
-        * the btree node anymore, use the old format for now:
-        */
-       if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
-               format = b->format;
-
-       SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
-       btree_set_min(n, b->data->min_key);
-       btree_set_max(n, b->data->max_key);
-
-       n->data->format         = format;
-       btree_node_set_format(n, format);
-
-       bch2_btree_sort_into(as->c, n, b);
-
-       btree_node_reset_sib_u64s(n);
-       return n;
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as,
-                               struct btree_trans *trans, unsigned level)
-{
-       struct btree *b = bch2_btree_node_alloc(as, trans, level);
-
-       btree_set_min(b, POS_MIN);
-       btree_set_max(b, SPOS_MAX);
-       b->data->format = bch2_btree_calc_format(b);
-
-       btree_node_set_format(b, b->data->format);
-       bch2_btree_build_aux_trees(b);
-
-       return b;
-}
-
-static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
-{
-       struct bch_fs *c = as->c;
-       struct prealloc_nodes *p;
-
-       for (p = as->prealloc_nodes;
-            p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
-            p++) {
-               while (p->nr) {
-                       struct btree *b = p->b[--p->nr];
-
-                       mutex_lock(&c->btree_reserve_cache_lock);
-
-                       if (c->btree_reserve_cache_nr <
-                           ARRAY_SIZE(c->btree_reserve_cache)) {
-                               struct btree_alloc *a =
-                                       &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
-                               a->ob = b->ob;
-                               b->ob.nr = 0;
-                               bkey_copy(&a->k, &b->key);
-                       } else {
-                               bch2_open_buckets_put(c, &b->ob);
-                       }
-
-                       mutex_unlock(&c->btree_reserve_cache_lock);
-
-                       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-                       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-                       __btree_node_free(trans, b);
-                       bch2_btree_node_to_freelist(c, b);
-               }
-       }
-}
-
-static int bch2_btree_reserve_get(struct btree_trans *trans,
-                                 struct btree_update *as,
-                                 unsigned nr_nodes[2],
-                                 unsigned target,
-                                 unsigned flags,
-                                 struct closure *cl)
-{
-       struct btree *b;
-       unsigned interior;
-       int ret = 0;
-
-       BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
-
-       /*
-        * Protects reaping from the btree node cache and using the btree node
-        * open bucket reserve:
-        */
-       ret = bch2_btree_cache_cannibalize_lock(trans, cl);
-       if (ret)
-               return ret;
-
-       for (interior = 0; interior < 2; interior++) {
-               struct prealloc_nodes *p = as->prealloc_nodes + interior;
-
-               while (p->nr < nr_nodes[interior]) {
-                       b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
-                                                   interior, target, flags);
-                       if (IS_ERR(b)) {
-                               ret = PTR_ERR(b);
-                               goto err;
-                       }
-
-                       p->b[p->nr++] = b;
-               }
-       }
-err:
-       bch2_btree_cache_cannibalize_unlock(trans);
-       return ret;
-}
-
-/* Asynchronous interior node update machinery */
-
-static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
-{
-       struct bch_fs *c = as->c;
-
-       if (as->took_gc_lock)
-               up_read(&c->gc_lock);
-       as->took_gc_lock = false;
-
-       bch2_journal_pin_drop(&c->journal, &as->journal);
-       bch2_journal_pin_flush(&c->journal, &as->journal);
-       bch2_disk_reservation_put(c, &as->disk_res);
-       bch2_btree_reserve_put(as, trans);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
-                              as->start_time);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_del(&as->unwritten_list);
-       list_del(&as->list);
-
-       closure_debug_destroy(&as->cl);
-       mempool_free(as, &c->btree_interior_update_pool);
-
-       /*
-        * Have to do the wakeup with btree_interior_update_lock still held,
-        * since being on btree_interior_update_list is our ref on @c:
-        */
-       closure_wake_up(&c->btree_interior_update_wait);
-
-       mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_update_add_key(struct btree_update *as,
-                                struct keylist *keys, struct btree *b)
-{
-       struct bkey_i *k = &b->key;
-
-       BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
-              ARRAY_SIZE(as->_old_keys));
-
-       bkey_copy(keys->top, k);
-       bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
-
-       bch2_keylist_push(keys);
-}
-
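-/*
- * Added note (not in the original source): v.mem_ptr is not meaningful on a
- * key being staged here, so it appears to be borrowed to stash the level the
- * key will live at - the parent's level, b->c.level + 1 - which
- * btree_update_nodes_written_trans() reads back to run triggers at the
- * correct level.
- */
-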
-static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
-{
-       for_each_keylist_key(&as->new_keys, k)
-               if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
-                       return false;
-       return true;
-}
-
-static void btree_update_new_nodes_mark_sb(struct btree_update *as)
-{
-       struct bch_fs *c = as->c;
-
-       mutex_lock(&c->sb_lock);
-       for_each_keylist_key(&as->new_keys, k)
-               bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-}
-
-/*
- * The transactional part of an interior btree node update, where we journal the
- * update we did to the interior node and update alloc info:
- */
-static int btree_update_nodes_written_trans(struct btree_trans *trans,
-                                           struct btree_update *as)
-{
-       struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
-       int ret = PTR_ERR_OR_ZERO(e);
-       if (ret)
-               return ret;
-
-       memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
-
-       trans->journal_pin = &as->journal;
-
-       for_each_keylist_key(&as->old_keys, k) {
-               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
-               ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
-                                          BTREE_TRIGGER_transactional);
-               if (ret)
-                       return ret;
-       }
-
-       for_each_keylist_key(&as->new_keys, k) {
-               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
-               ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
-                                          BTREE_TRIGGER_transactional);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
-static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
-{
-       struct btree_node *b_data = READ_ONCE(b->data);
-
-       return (b_data ? b_data->keys.seq : 0) == seq;
-}
-
-static void btree_update_nodes_written(struct btree_update *as)
-{
-       struct bch_fs *c = as->c;
-       struct btree *b;
-       struct btree_trans *trans = bch2_trans_get(c);
-       u64 journal_seq = 0;
-       unsigned i;
-       int ret;
-
-       /*
-        * If we're already in an error state, it might be because a btree node
-        * was never written, and we might be trying to free that same btree
-        * node here, but it won't have been marked as allocated and we'll see
-        * spurious disk usage inconsistencies in the transactional part below
-        * if we don't skip it:
-        */
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               goto err;
-
-       if (!btree_update_new_nodes_marked_sb(as))
-               btree_update_new_nodes_mark_sb(as);
-
-       /*
-        * Wait for any in flight writes to finish before we free the old nodes
-        * on disk. But we haven't pinned those old nodes in the btree cache,
-        * they might have already been evicted.
-        *
-        * The update we're completing deleted references to those nodes from the
-        * btree, so we know if they've been evicted they can't be pulled back in.
-        * We just have to check if the nodes we have pointers to are still those
-        * old nodes, and haven't been reused.
-        *
-        * This can't be done locklessly because the data buffer might have been
-        * vmalloc allocated, and they're not RCU freed. We also need the
-        * __no_kmsan_checks annotation because even with the btree node read
-        * lock, nothing tells us that the data buffer has been initialized (if
-        * the btree node has been reused for a different node, and the data
-        * buffer swapped for a new data buffer).
-        */
-       for (i = 0; i < as->nr_old_nodes; i++) {
-               b = as->old_nodes[i];
-
-               bch2_trans_begin(trans);
-               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-               bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
-               six_unlock_read(&b->c.lock);
-               bch2_trans_unlock_long(trans);
-
-               if (seq_matches)
-                       wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
-                                      TASK_UNINTERRUPTIBLE);
-       }
-
-       /*
-        * We did an update to a parent node where the pointers we added pointed
-        * to child nodes that weren't written yet: now, the child nodes have
-        * been written so we can write out the update to the interior node.
-        */
-
-       /*
-        * We can't call into journal reclaim here: we'd block on the journal
-        * reclaim lock, but we may need to release the open buckets we have
-        * pinned in order for other btree updates to make forward progress, and
-        * journal reclaim does btree updates when flushing bkey_cached entries,
-        * which may require allocations as well.
-        */
-       ret = commit_do(trans, &as->disk_res, &journal_seq,
-                       BCH_WATERMARK_interior_updates|
-                       BCH_TRANS_COMMIT_no_enospc|
-                       BCH_TRANS_COMMIT_no_check_rw|
-                       BCH_TRANS_COMMIT_journal_reclaim,
-                       btree_update_nodes_written_trans(trans, as));
-       bch2_trans_unlock(trans);
-
-       bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
-                            "%s", bch2_err_str(ret));
-err:
-       /*
-        * Ensure transaction is unlocked before using btree_node_lock_nopath()
-        * (the use of which is always suspect, we need to work on removing this
-        * in the future)
-        *
-        * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
-        * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
-        * rarely end up with a locked path besides the one we have here:
-        */
-       bch2_trans_unlock(trans);
-       bch2_trans_begin(trans);
-
-       /*
-        * We have to be careful because another thread might be getting ready
-        * to free as->b and calling btree_update_reparent() on us - we'll
-        * recheck under btree_update_lock below:
-        */
-       b = READ_ONCE(as->b);
-       if (b) {
-               /*
-                * @b is the node we did the final insert into:
-                *
-                * On failure to get a journal reservation, we still have to
-                * unblock the write and allow most of the write path to happen
-                * so that shutdown works, but the i->journal_seq mechanism
-                * won't work to prevent the btree write from being visible (we
-                * didn't get a journal sequence number) - instead
-                * __bch2_btree_node_write() doesn't do the actual write if
-                * we're in journal error state:
-                */
-
-               btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
-                                               as->btree_id, b->c.level, b->key.k.p);
-               struct btree_path *path = trans->paths + path_idx;
-               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-               path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
-               path->l[b->c.level].b = b;
-
-               bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
-               mutex_lock(&c->btree_interior_update_lock);
-
-               list_del(&as->write_blocked_list);
-               if (list_empty(&b->write_blocked))
-                       clear_btree_node_write_blocked(b);
-
-               /*
-                * Node might have been freed, recheck under
-                * btree_interior_update_lock:
-                */
-               if (as->b == b) {
-                       BUG_ON(!b->c.level);
-                       BUG_ON(!btree_node_dirty(b));
-
-                       if (!ret) {
-                               struct bset *last = btree_bset_last(b);
-
-                               last->journal_seq = cpu_to_le64(
-                                                            max(journal_seq,
-                                                                le64_to_cpu(last->journal_seq)));
-
-                               bch2_btree_add_journal_pin(c, b, journal_seq);
-                       } else {
-                               /*
-                                * If we didn't get a journal sequence number we
-                                * can't write this btree node, because recovery
-                                * won't know to ignore this write:
-                                */
-                               set_btree_node_never_write(b);
-                       }
-               }
-
-               mutex_unlock(&c->btree_interior_update_lock);
-
-               mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-               six_unlock_write(&b->c.lock);
-
-               btree_node_write_if_need(trans, b, SIX_LOCK_intent);
-               btree_node_unlock(trans, path, b->c.level);
-               bch2_path_put(trans, path_idx, true);
-       }
-
-       bch2_journal_pin_drop(&c->journal, &as->journal);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       for (i = 0; i < as->nr_new_nodes; i++) {
-               b = as->new_nodes[i];
-
-               BUG_ON(b->will_make_reachable != (unsigned long) as);
-               b->will_make_reachable = 0;
-               clear_btree_node_will_make_reachable(b);
-       }
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       for (i = 0; i < as->nr_new_nodes; i++) {
-               b = as->new_nodes[i];
-
-               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-               btree_node_write_if_need(trans, b, SIX_LOCK_read);
-               six_unlock_read(&b->c.lock);
-       }
-
-       for (i = 0; i < as->nr_open_buckets; i++)
-               bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
-
-       bch2_btree_update_free(as, trans);
-       bch2_trans_put(trans);
-}
-
-static void btree_interior_update_work(struct work_struct *work)
-{
-       struct bch_fs *c =
-               container_of(work, struct bch_fs, btree_interior_update_work);
-       struct btree_update *as;
-
-       while (1) {
-               mutex_lock(&c->btree_interior_update_lock);
-               as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
-                                             struct btree_update, unwritten_list);
-               if (as && !as->nodes_written)
-                       as = NULL;
-               mutex_unlock(&c->btree_interior_update_lock);
-
-               if (!as)
-                       break;
-
-               btree_update_nodes_written(as);
-       }
-}
-
-static CLOSURE_CALLBACK(btree_update_set_nodes_written)
-{
-       closure_type(as, struct btree_update, cl);
-       struct bch_fs *c = as->c;
-
-       mutex_lock(&c->btree_interior_update_lock);
-       as->nodes_written = true;
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_update_updated_node(struct btree_update *as, struct btree *b)
-{
-       struct bch_fs *c = as->c;
-
-       BUG_ON(as->mode != BTREE_UPDATE_none);
-       BUG_ON(as->update_level_end < b->c.level);
-       BUG_ON(!btree_node_dirty(b));
-       BUG_ON(!b->c.level);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
-       as->mode        = BTREE_UPDATE_node;
-       as->b           = b;
-       as->update_level_end = b->c.level;
-
-       set_btree_node_write_blocked(b);
-       list_add(&as->write_blocked_list, &b->write_blocked);
-
-       mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static int bch2_update_reparent_journal_pin_flush(struct journal *j,
-                               struct journal_entry_pin *_pin, u64 seq)
-{
-       return 0;
-}
-
-static void btree_update_reparent(struct btree_update *as,
-                                 struct btree_update *child)
-{
-       struct bch_fs *c = as->c;
-
-       lockdep_assert_held(&c->btree_interior_update_lock);
-
-       child->b = NULL;
-       child->mode = BTREE_UPDATE_update;
-
-       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
-                             bch2_update_reparent_journal_pin_flush);
-}
-
-static void btree_update_updated_root(struct btree_update *as, struct btree *b)
-{
-       struct bkey_i *insert = &b->key;
-       struct bch_fs *c = as->c;
-
-       BUG_ON(as->mode != BTREE_UPDATE_none);
-
-       BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
-              ARRAY_SIZE(as->journal_entries));
-
-       as->journal_u64s +=
-               journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
-                                 BCH_JSET_ENTRY_btree_root,
-                                 b->c.btree_id, b->c.level,
-                                 insert, insert->k.u64s);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
-       as->mode        = BTREE_UPDATE_root;
-       mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
- * bch2_btree_update_add_new_node:
- *
- * This causes @as to wait on @b to be written, before it gets to
- * btree_update_nodes_written
- *
- * Additionally, it sets b->will_make_reachable to prevent any additional writes
- * to @b from happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in btree_update_nodes_written:
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
-       struct bch_fs *c = as->c;
-
-       closure_get(&as->cl);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
-       BUG_ON(b->will_make_reachable);
-
-       as->new_nodes[as->nr_new_nodes++] = b;
-       b->will_make_reachable = 1UL|(unsigned long) as;
-       set_btree_node_will_make_reachable(b);
-
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       btree_update_add_key(as, &as->new_keys, b);
-
-       if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
-               unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
-               unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
-               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
-                       cpu_to_le16(sectors);
-       }
-}
-
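-/*
- * Editorial aside (not in the original source): will_make_reachable is a
- * tagged pointer - bit 0 records that @b still holds a ref on as->cl, the
- * remaining bits are the btree_update pointer:
- *
- *	b->will_make_reachable = 1UL | (unsigned long) as;	set + take ref
- *	as = (struct btree_update *) (v & ~1UL);		decode
- *	if (v & 1)
- *		closure_put(&as->cl);				drop the ref
- */
-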
-/*
- * If @b was a new node - one that an in-flight btree_update was going to
- * make reachable - drop it from that update's list of new nodes and release
- * the ref it held on the update:
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
-       struct btree_update *as;
-       unsigned long v;
-       unsigned i;
-
-       mutex_lock(&c->btree_interior_update_lock);
-       /*
-        * When b->will_make_reachable != 0, it owns a ref on as->cl that's
-        * dropped when it gets written by bch2_btree_complete_write - the
-        * xchg() is for synchronization with bch2_btree_complete_write:
-        */
-       v = xchg(&b->will_make_reachable, 0);
-       clear_btree_node_will_make_reachable(b);
-       as = (struct btree_update *) (v & ~1UL);
-
-       if (!as) {
-               mutex_unlock(&c->btree_interior_update_lock);
-               return;
-       }
-
-       for (i = 0; i < as->nr_new_nodes; i++)
-               if (as->new_nodes[i] == b)
-                       goto found;
-
-       BUG();
-found:
-       array_remove_item(as->new_nodes, as->nr_new_nodes, i);
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       if (v & 1)
-               closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
-       while (b->ob.nr)
-               as->open_buckets[as->nr_open_buckets++] =
-                       b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
-                               struct journal_entry_pin *_pin, u64 seq)
-{
-       return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
-                                                     struct btree *b)
-{
-       struct bch_fs *c = as->c;
-       struct btree_update *p, *n;
-       struct btree_write *w;
-
-       set_btree_node_dying(b);
-
-       if (btree_node_fake(b))
-               return;
-
-       mutex_lock(&c->btree_interior_update_lock);
-
-       /*
-        * Does this node have any btree_update operations preventing
-        * it from being written?
-        *
-        * If so, redirect them to point to this btree_update: we can
-        * write out our new nodes, but we won't make them visible until those
-        * operations complete
-        */
-       list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
-               list_del_init(&p->write_blocked_list);
-               btree_update_reparent(as, p);
-
-               /*
-                * for flush_held_btree_writes() waiting on updates to flush or
-                * nodes to be writeable:
-                */
-               closure_wake_up(&c->btree_interior_update_wait);
-       }
-
-       clear_btree_node_dirty_acct(c, b);
-       clear_btree_node_need_write(b);
-       clear_btree_node_write_blocked(b);
-
-       /*
-        * Does this node have unwritten data that has a pin on the journal?
-        *
-        * If so, transfer that pin to the btree_update operation -
-        * note that if we're freeing multiple nodes, we only need to keep the
-        * oldest pin of any of the nodes we're freeing. We'll release the pin
-        * when the new nodes are persistent and reachable on disk:
-        */
-       w = btree_current_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
-                             bch2_btree_update_will_free_node_journal_pin_flush);
-       bch2_journal_pin_drop(&c->journal, &w->journal);
-
-       w = btree_prev_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
-                             bch2_btree_update_will_free_node_journal_pin_flush);
-       bch2_journal_pin_drop(&c->journal, &w->journal);
-
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       /*
-        * Is this a node that isn't reachable on disk yet?
-        *
-        * Nodes that aren't reachable yet have writes blocked until they're
-        * reachable - now that we've cancelled any pending writes and moved
-        * things waiting on that write to wait on this update, we can drop this
-        * node from the list of nodes that the other update is making
-        * reachable, prior to freeing it:
-        */
-       btree_update_drop_new_node(c, b);
-
-       btree_update_add_key(as, &as->old_keys, b);
-
-       as->old_nodes[as->nr_old_nodes] = b;
-       as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
-       as->nr_old_nodes++;
-}
-
-static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
-{
-       struct bch_fs *c = as->c;
-       u64 start_time = as->start_time;
-
-       BUG_ON(as->mode == BTREE_UPDATE_none);
-
-       if (as->took_gc_lock)
-               up_read(&as->c->gc_lock);
-       as->took_gc_lock = false;
-
-       bch2_btree_reserve_put(as, trans);
-
-       continue_at(&as->cl, btree_update_set_nodes_written,
-                   as->c->btree_interior_update_worker);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
-                              start_time);
-}
-
-static const char * const btree_node_reawrite_reason_strs[] = {
-#define x(n)   #n,
-       BTREE_NODE_REWRITE_REASON()
-#undef x
-       NULL,
-};
-
-static struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
-                       unsigned level_start, bool split,
-                       unsigned target, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_update *as;
-       u64 start_time = local_clock();
-       int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
-               ? BCH_DISK_RESERVATION_NOFAIL : 0;
-       unsigned nr_nodes[2] = { 0, 0 };
-       unsigned level_end = level_start;
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       int ret = 0;
-       u32 restart_count = trans->restart_count;
-
-       BUG_ON(!path->should_be_locked);
-
-       if (watermark == BCH_WATERMARK_copygc)
-               watermark = BCH_WATERMARK_btree_copygc;
-       if (watermark < BCH_WATERMARK_btree)
-               watermark = BCH_WATERMARK_btree;
-
-       flags &= ~BCH_WATERMARK_MASK;
-       flags |= watermark;
-
-       if (watermark < BCH_WATERMARK_reclaim &&
-           test_bit(JOURNAL_space_low, &c->journal.flags)) {
-               if (flags & BCH_TRANS_COMMIT_journal_reclaim)
-                       return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
-
-               ret = drop_locks_do(trans,
-                       ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-
-       while (1) {
-               nr_nodes[!!level_end] += 1 + split;
-               level_end++;
-
-               ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
-               if (ret)
-                       return ERR_PTR(ret);
-
-               if (!btree_path_node(path, level_end)) {
-                       /* Allocating new root? */
-                       nr_nodes[1] += split;
-                       level_end = BTREE_MAX_DEPTH;
-                       break;
-               }
-
-               /*
-                * Always check for space for two keys, even if we won't have to
-                * split at prior level - it might have been a merge instead:
-                */
-               if (bch2_btree_node_insert_fits(path->l[level_end].b,
-                                               BKEY_BTREE_PTR_U64s_MAX * 2))
-                       break;
-
-               split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
-       }
-
-       if (!down_read_trylock(&c->gc_lock)) {
-               ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
-               if (ret) {
-                       up_read(&c->gc_lock);
-                       return ERR_PTR(ret);
-               }
-       }
-
-       as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
-       memset(as, 0, sizeof(*as));
-       closure_init(&as->cl, NULL);
-       as->c                   = c;
-       as->start_time          = start_time;
-       as->ip_started          = _RET_IP_;
-       as->mode                = BTREE_UPDATE_none;
-       as->flags               = flags;
-       as->took_gc_lock        = true;
-       as->btree_id            = path->btree_id;
-       as->update_level_start  = level_start;
-       as->update_level_end    = level_end;
-       INIT_LIST_HEAD(&as->list);
-       INIT_LIST_HEAD(&as->unwritten_list);
-       INIT_LIST_HEAD(&as->write_blocked_list);
-       bch2_keylist_init(&as->old_keys, as->_old_keys);
-       bch2_keylist_init(&as->new_keys, as->_new_keys);
-       bch2_keylist_init(&as->parent_keys, as->inline_keys);
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->list, &c->btree_interior_update_list);
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       struct btree *b = btree_path_node(path, path->level);
-       as->node_start  = b->data->min_key;
-       as->node_end    = b->data->max_key;
-       as->node_needed_rewrite = btree_node_rewrite_reason(b);
-       as->node_written = b->written;
-       as->node_sectors = btree_buf_bytes(b) >> 9;
-       as->node_remaining = __bch2_btree_u64s_remaining(b,
-                               btree_bkey_last(b, bset_tree_last(b)));
-
-       /*
-        * We don't want to allocate if we're in an error state, that can cause
-        * deadlock on emergency shutdown due to open buckets getting stuck in
-        * the btree_reserve_cache after allocator shutdown has cleared it out.
-        * This check needs to come after adding us to the btree_interior_update
-        * list but before calling bch2_btree_reserve_get, to synchronize with
-        * __bch2_fs_read_only().
-        */
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               goto err;
-
-       ret = bch2_disk_reservation_get(c, &as->disk_res,
-                       (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
-                       READ_ONCE(c->opts.metadata_replicas),
-                       disk_res_flags);
-       if (ret)
-               goto err;
-
-       ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
-       if (bch2_err_matches(ret, ENOSPC) ||
-           bch2_err_matches(ret, ENOMEM)) {
-               struct closure cl;
-
-               /*
-                * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-                * flag
-                */
-               if (bch2_err_matches(ret, ENOSPC) &&
-                   (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-                   watermark < BCH_WATERMARK_reclaim) {
-                       ret = bch_err_throw(c, journal_reclaim_would_deadlock);
-                       goto err;
-               }
-
-               closure_init_stack(&cl);
-
-               do {
-                       ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
-                       if (!bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                               break;
-                       bch2_trans_unlock(trans);
-                       bch2_wait_on_allocator(c, &cl);
-               } while (1);
-       }
-
-       if (ret) {
-               trace_and_count(c, btree_reserve_get_fail, trans->fn,
-                               _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
-               goto err;
-       }
-
-       ret = bch2_trans_relock(trans);
-       if (ret)
-               goto err;
-
-       bch2_trans_verify_not_restarted(trans, restart_count);
-       return as;
-err:
-       bch2_btree_update_free(as, trans);
-       if (!bch2_err_matches(ret, ENOSPC) &&
-           !bch2_err_matches(ret, EROFS) &&
-           ret != -BCH_ERR_journal_reclaim_would_deadlock &&
-           ret != -BCH_ERR_journal_shutdown)
-               bch_err_fn_ratelimited(c, ret);
-       return ERR_PTR(ret);
-}
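
The down_read_trylock()/drop_locks_do() dance in bch2_btree_update_start()
above is worth a gloss: if gc_lock can't be taken immediately, the
transaction drops its btree locks before blocking on it, then relocks, so a
thread holding btree locks never sleeps on gc_lock. A minimal userspace
sketch of the pattern, using pthreads and hypothetical names (not bcachefs
code):

#include <pthread.h>
#include <stdbool.h>

struct xact {
        pthread_mutex_t *held;          /* lock(s) this transaction holds */
};

static void xact_unlock(struct xact *t) { pthread_mutex_unlock(t->held); }
static bool xact_relock(struct xact *t) { return pthread_mutex_lock(t->held) == 0; }

static int take_outer_lock(struct xact *t, pthread_rwlock_t *outer)
{
        if (pthread_rwlock_tryrdlock(outer) == 0)
                return 0;                       /* fast path: uncontended */

        /*
         * Slow path: drop our locks before blocking, so we can't deadlock
         * against a thread that takes @outer before its transaction locks:
         */
        xact_unlock(t);
        pthread_rwlock_rdlock(outer);

        if (!xact_relock(t)) {                  /* caller restarts on failure */
                pthread_rwlock_unlock(outer);
                return -1;
        }
        return 0;
}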
-
-/* Btree root updates: */
-
-static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
-       /* Root nodes cannot be reaped */
-       mutex_lock(&c->btree_cache.lock);
-       list_del_init(&b->list);
-       mutex_unlock(&c->btree_cache.lock);
-
-       mutex_lock(&c->btree_root_lock);
-       bch2_btree_id_root(c, b->c.btree_id)->b = b;
-       mutex_unlock(&c->btree_root_lock);
-
-       bch2_recalc_btree_reserve(c);
-}
-
-static int bch2_btree_set_root(struct btree_update *as,
-                              struct btree_trans *trans,
-                              struct btree_path *path,
-                              struct btree *b,
-                              bool nofail)
-{
-       struct bch_fs *c = as->c;
-
-       trace_and_count(c, btree_node_set_root, trans, b);
-
-       struct btree *old = btree_node_root(c, b);
-
-       /*
-        * Ensure no one is using the old root while we switch to the
-        * new root:
-        */
-       if (nofail) {
-               bch2_btree_node_lock_write_nofail(trans, path, &old->c);
-       } else {
-               int ret = bch2_btree_node_lock_write(trans, path, &old->c);
-               if (ret)
-                       return ret;
-       }
-
-       bch2_btree_set_root_inmem(c, b);
-
-       btree_update_updated_root(as, b);
-
-       /*
-        * Unlock old root after new root is visible:
-        *
-        * The new root isn't persistent, but that's ok: we still have
-        * an intent lock on the new root, and any updates that would
-        * depend on the new root would have to update the new root.
-        */
-       bch2_btree_node_unlock_write(trans, path, old);
-       return 0;
-}
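
bch2_btree_set_root() encodes an ordering rule: the new root is published
while the old root is still write-locked, and the old root is unlocked only
afterwards, so no thread can walk the tree between the two roots. A rough
C11-atomics analogue of the publish-then-unlock step (hypothetical names;
the kernel uses its own locking primitives, not bare atomics):

#include <stdatomic.h>

struct node;

static _Atomic(struct node *) root;

/* Writer: called while the old root is still write-locked */
static void publish_new_root(struct node *n)
{
        /* release: everything written to *n happens-before the pointer swap */
        atomic_store_explicit(&root, n, memory_order_release);
        /* the caller unlocks the old root only after this point */
}

/* Readers pair with an acquire load: */
static struct node *read_root(void)
{
        return atomic_load_explicit(&root, memory_order_acquire);
}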
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
-                                       struct btree_trans *trans,
-                                       struct btree_path *path,
-                                       struct btree *b,
-                                       struct btree_node_iter *node_iter,
-                                       struct bkey_i *insert)
-{
-       struct bch_fs *c = as->c;
-       struct bkey_packed *k;
-       struct printbuf buf = PRINTBUF;
-       unsigned long old, new;
-
-       BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
-              !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
-
-       if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
-               bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
-       struct bkey_validate_context from = (struct bkey_validate_context) {
-               .from   = BKEY_VALIDATE_btree_node,
-               .level  = b->c.level,
-               .btree  = b->c.btree_id,
-               .flags  = BCH_VALIDATE_commit,
-       };
-       if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
-           bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
-               bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
-               dump_stack();
-       }
-
-       BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
-              ARRAY_SIZE(as->journal_entries));
-
-       as->journal_u64s +=
-               journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
-                                 BCH_JSET_ENTRY_btree_keys,
-                                 b->c.btree_id, b->c.level,
-                                 insert, insert->k.u64s);
-
-       while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
-              bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
-               bch2_btree_node_iter_advance(node_iter, b);
-
-       bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
-       set_btree_node_dirty_acct(c, b);
-
-       old = READ_ONCE(b->flags);
-       do {
-               new = old;
-
-               new &= ~BTREE_WRITE_TYPE_MASK;
-               new |= BTREE_WRITE_interior;
-               new |= 1 << BTREE_NODE_need_write;
-       } while (!try_cmpxchg(&b->flags, &old, new));
-
-       printbuf_exit(&buf);
-}
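
The flags update at the end of bch2_insert_fixup_btree_ptr() is the kernel's
lock-free read-modify-write idiom: read the old value once, compute the new
one, and retry with try_cmpxchg() until no concurrent writer slipped in
between. The same loop in portable C11, with a hypothetical bit layout
standing in for the real BTREE_* flags:

#include <stdatomic.h>

#define WRITE_TYPE_MASK         0x3UL   /* hypothetical bit layout */
#define WRITE_TYPE_INTERIOR     0x2UL
#define NEED_WRITE              (1UL << 2)

static void mark_need_interior_write(_Atomic unsigned long *flags)
{
        unsigned long old = atomic_load_explicit(flags, memory_order_relaxed);
        unsigned long new;

        do {
                new  = old & ~WRITE_TYPE_MASK;
                new |= WRITE_TYPE_INTERIOR;
                new |= NEED_WRITE;
                /* on failure, 'old' is reloaded with the current value */
        } while (!atomic_compare_exchange_weak(flags, &old, new));
}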
-
-static int
-bch2_btree_insert_keys_interior(struct btree_update *as,
-                               struct btree_trans *trans,
-                               struct btree_path *path,
-                               struct btree *b,
-                               struct btree_node_iter node_iter,
-                               struct keylist *keys)
-{
-       struct bkey_i *insert = bch2_keylist_front(keys);
-       struct bkey_packed *k;
-
-       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
-       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
-               ;
-
-       for (;
-            insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
-            insert = bkey_next(insert))
-               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-
-       int ret = bch2_btree_node_check_topology(trans, b);
-       if (ret) {
-               struct printbuf buf = PRINTBUF;
-
-               for (struct bkey_i *k = keys->keys;
-                    k != insert;
-                    k = bkey_next(k)) {
-                       bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
-                       prt_newline(&buf);
-               }
-
-               bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
-                                   (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
-               dump_stack();
-               return ret;
-       }
-
-       memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
-       keys->top_p -= insert->_data - keys->keys_p;
-       return 0;
-}
-
-static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
-{
-       if (insert_keys)
-               for_each_keylist_key(insert_keys, k)
-                       if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
-                               return true;
-       return false;
-}
-
-/*
- * Move keys from n1 (original replacement node, now lower node) to n2 (higher
- * node)
- */
-static void __btree_split_node(struct btree_update *as,
-                              struct btree_trans *trans,
-                              struct btree *b,
-                              struct btree *n[2],
-                              struct keylist *insert_keys)
-{
-       struct bkey_packed *k;
-       struct bpos n1_pos = POS_MIN;
-       struct btree_node_iter iter;
-       struct bset *bsets[2];
-       struct bkey_format_state format[2];
-       struct bkey_packed *out[2];
-       struct bkey uk;
-       unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
-       struct { unsigned nr_keys, val_u64s; } nr_keys[2];
-       int i;
-
-       memset(&nr_keys, 0, sizeof(nr_keys));
-
-       for (i = 0; i < 2; i++) {
-               BUG_ON(n[i]->nsets != 1);
-
-               bsets[i] = btree_bset_first(n[i]);
-               out[i] = bsets[i]->start;
-
-               SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
-               bch2_bkey_format_init(&format[i]);
-       }
-
-       u64s = 0;
-       for_each_btree_node_key(b, k, &iter) {
-               if (bkey_deleted(k))
-                       continue;
-
-               uk = bkey_unpack_key(b, k);
-
-               if (b->c.level &&
-                   u64s < n1_u64s &&
-                   u64s + k->u64s >= n1_u64s &&
-                   (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
-                    key_deleted_in_insert(insert_keys, uk.p)))
-                       n1_u64s += k->u64s;
-
-               i = u64s >= n1_u64s;
-               u64s += k->u64s;
-               if (!i)
-                       n1_pos = uk.p;
-               bch2_bkey_format_add_key(&format[i], &uk);
-
-               nr_keys[i].nr_keys++;
-               nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
-       }
-
-       btree_set_min(n[0], b->data->min_key);
-       btree_set_max(n[0], n1_pos);
-       btree_set_min(n[1], bpos_successor(n1_pos));
-       btree_set_max(n[1], b->data->max_key);
-
-       for (i = 0; i < 2; i++) {
-               bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
-               bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
-               n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
-               unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
-                       nr_keys[i].val_u64s;
-               if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
-                       n[i]->data->format = b->format;
-
-               btree_node_set_format(n[i], n[i]->data->format);
-       }
-
-       u64s = 0;
-       for_each_btree_node_key(b, k, &iter) {
-               if (bkey_deleted(k))
-                       continue;
-
-               i = u64s >= n1_u64s;
-               u64s += k->u64s;
-
-               if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
-                                       ? &b->format : &bch2_bkey_format_current, k))
-                       out[i]->format = KEY_FORMAT_LOCAL_BTREE;
-               else
-                       bch2_bkey_unpack(b, (void *) out[i], k);
-
-               out[i]->needs_whiteout = false;
-
-               btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
-               out[i] = bkey_p_next(out[i]);
-       }
-
-       for (i = 0; i < 2; i++) {
-               bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
-               BUG_ON(!bsets[i]->u64s);
-
-               set_btree_bset_end(n[i], n[i]->set);
-
-               btree_node_reset_sib_u64s(n[i]);
-
-               bch2_verify_btree_nr_keys(n[i]);
-
-               BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
-       }
-}
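
__btree_split_node() picks its split point by aiming the lower node at
roughly 3/5 of the live u64s: keys are assigned to n1 until the running
total crosses that threshold, and everything after goes to n2. A standalone
sketch of the basic calculation (ignoring the deleted-key adjustment) over
made-up key sizes, not bcachefs code:

#include <stdio.h>

int main(void)
{
        unsigned key_u64s[] = { 4, 7, 3, 5, 9, 2, 6, 8 };  /* variable-size keys */
        unsigned nr = sizeof(key_u64s) / sizeof(key_u64s[0]);
        unsigned total = 0, u64s = 0, split = nr;

        for (unsigned i = 0; i < nr; i++)
                total += key_u64s[i];

        unsigned n1_u64s = (total * 3) / 5;     /* same 3/5 bias as above */

        for (unsigned i = 0; i < nr; i++) {
                if (u64s >= n1_u64s) {          /* first key past the threshold */
                        split = i;              /* starts the higher node */
                        break;
                }
                u64s += key_u64s[i];
        }

        /* total 44, threshold 26: keys [0,5) go to n1, [5,8) to n2 */
        printf("split at key %u of %u\n", split, nr);
        return 0;
}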
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the keys we're inserting have to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert is from btree node coalescing and we do the insert after
- * the split (and pick the pivot), the pivot we pick might be between nodes that
- * were coalesced - i.e. in the middle of a child node post coalescing:
- */
-static int btree_split_insert_keys(struct btree_update *as,
-                                  struct btree_trans *trans,
-                                  btree_path_idx_t path_idx,
-                                  struct btree *b,
-                                  struct keylist *keys)
-{
-       struct btree_path *path = trans->paths + path_idx;
-
-       if (!bch2_keylist_empty(keys) &&
-           bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
-               struct btree_node_iter node_iter;
-
-               bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
-               int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
-                      btree_path_idx_t path, struct btree *b,
-                      struct keylist *keys)
-{
-       struct bch_fs *c = as->c;
-       struct btree *parent = btree_node_parent(trans->paths + path, b);
-       struct btree *n1, *n2 = NULL, *n3 = NULL;
-       btree_path_idx_t path1 = 0, path2 = 0;
-       u64 start_time = local_clock();
-       int ret = 0;
-
-       bch2_verify_btree_nr_keys(b);
-       BUG_ON(!parent && (b != btree_node_root(c, b)));
-       BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
-       ret = bch2_btree_node_check_topology(trans, b);
-       if (ret)
-               return ret;
-
-       if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
-               struct btree *n[2];
-
-               trace_and_count(c, btree_node_split, trans, b);
-
-               n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
-               n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
-               __btree_split_node(as, trans, b, n, keys);
-
-               if (keys) {
-                       ret =   btree_split_insert_keys(as, trans, path, n1, keys) ?:
-                               btree_split_insert_keys(as, trans, path, n2, keys);
-                       if (ret)
-                               goto err;
-                       BUG_ON(!bch2_keylist_empty(keys));
-               }
-
-               bch2_btree_build_aux_trees(n2);
-               bch2_btree_build_aux_trees(n1);
-
-               bch2_btree_update_add_new_node(as, n1);
-               bch2_btree_update_add_new_node(as, n2);
-               six_unlock_write(&n2->c.lock);
-               six_unlock_write(&n1->c.lock);
-
-               path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-               six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-               bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-               path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
-               six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
-               bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
-               /*
-                * Note that on recursive splits, parent_keys == keys, so we
-                * can't start adding new keys to parent_keys before emptying it
-                * out (which we did with btree_split_insert_keys() above)
-                */
-               bch2_keylist_add(&as->parent_keys, &n1->key);
-               bch2_keylist_add(&as->parent_keys, &n2->key);
-
-               if (!parent) {
-                       /* Depth increases, make a new root */
-                       n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
-                       bch2_btree_update_add_new_node(as, n3);
-                       six_unlock_write(&n3->c.lock);
-
-                       trans->paths[path2].locks_want++;
-                       BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
-                       six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-                       mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
-                       bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
-                       n3->sib_u64s[0] = U16_MAX;
-                       n3->sib_u64s[1] = U16_MAX;
-
-                       ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-                       if (ret)
-                               goto err;
-               }
-       } else {
-               trace_and_count(c, btree_node_compact, trans, b);
-
-               n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
-               if (keys) {
-                       ret = btree_split_insert_keys(as, trans, path, n1, keys);
-                       if (ret)
-                               goto err;
-                       BUG_ON(!bch2_keylist_empty(keys));
-               }
-
-               bch2_btree_build_aux_trees(n1);
-               bch2_btree_update_add_new_node(as, n1);
-               six_unlock_write(&n1->c.lock);
-
-               path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-               six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-               bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-               if (parent)
-                       bch2_keylist_add(&as->parent_keys, &n1->key);
-       }
-
-       /* New nodes all written, now make them visible: */
-
-       if (parent) {
-               /* Split a non root node */
-               ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-       } else if (n3) {
-               ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
-       } else {
-               /* Root filled up but didn't need to be split */
-               ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
-       }
-
-       if (ret)
-               goto err;
-
-       bch2_btree_interior_update_will_free_node(as, b);
-
-       if (n3) {
-               bch2_btree_update_get_open_buckets(as, n3);
-               bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
-       }
-       if (n2) {
-               bch2_btree_update_get_open_buckets(as, n2);
-               bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
-       }
-       bch2_btree_update_get_open_buckets(as, n1);
-       bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
-
-       /*
-        * The old node must be freed (in memory) _before_ unlocking the new
-        * nodes - else another thread could re-acquire a read lock on the old
-        * node after another thread has locked and updated the new node, thus
-        * seeing stale data:
-        */
-       bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
-       if (n3)
-               bch2_trans_node_add(trans, trans->paths + path, n3);
-       if (n2)
-               bch2_trans_node_add(trans, trans->paths + path2, n2);
-       bch2_trans_node_add(trans, trans->paths + path1, n1);
-
-       if (n3)
-               six_unlock_intent(&n3->c.lock);
-       if (n2)
-               six_unlock_intent(&n2->c.lock);
-       six_unlock_intent(&n1->c.lock);
-out:
-       if (path2) {
-               __bch2_btree_path_unlock(trans, trans->paths + path2);
-               bch2_path_put(trans, path2, true);
-       }
-       if (path1) {
-               __bch2_btree_path_unlock(trans, trans->paths + path1);
-               bch2_path_put(trans, path1, true);
-       }
-
-       bch2_trans_verify_locks(trans);
-
-       bch2_time_stats_update(&c->times[n2
-                              ? BCH_TIME_btree_node_split
-                              : BCH_TIME_btree_node_compact],
-                              start_time);
-       return ret;
-err:
-       if (n3)
-               bch2_btree_node_free_never_used(as, trans, n3);
-       if (n2)
-               bch2_btree_node_free_never_used(as, trans, n2);
-       bch2_btree_node_free_never_used(as, trans, n1);
-       goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as:                        btree_update object
- * @trans:             btree_trans object
- * @path_idx:          path that points to current node
- * @b:                 node to insert keys into
- * @keys:              list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
-                                 btree_path_idx_t path_idx, struct btree *b,
-                                 struct keylist *keys)
-{
-       struct bch_fs *c = as->c;
-       struct btree_path *path = trans->paths + path_idx, *linked;
-       unsigned i;
-       int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
-       int old_live_u64s = b->nr.live_u64s;
-       int live_u64s_added, u64s_added;
-       int ret;
-
-       lockdep_assert_held(&c->gc_lock);
-       BUG_ON(!b->c.level);
-       BUG_ON(!as || as->b);
-       bch2_verify_keylist_sorted(keys);
-
-       if (!btree_node_intent_locked(path, b->c.level)) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf, "%s(): node not locked at level %u\n",
-                          __func__, b->c.level);
-               bch2_btree_update_to_text(&buf, as);
-               bch2_btree_path_to_text(&buf, trans, path_idx);
-               bch2_fs_emergency_read_only2(c, &buf);
-
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-               return -EIO;
-       }
-
-       ret = bch2_btree_node_lock_write(trans, path, &b->c);
-       if (ret)
-               return ret;
-
-       bch2_btree_node_prep_for_write(trans, path, b);
-
-       if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
-               bch2_btree_node_unlock_write(trans, path, b);
-               goto split;
-       }
-
-       ret =   bch2_btree_node_check_topology(trans, b) ?:
-               bch2_btree_insert_keys_interior(as, trans, path, b,
-                                       path->l[b->c.level].iter, keys);
-       if (ret) {
-               bch2_btree_node_unlock_write(trans, path, b);
-               return ret;
-       }
-
-       trans_for_each_path_with_node(trans, b, linked, i)
-               bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
-       bch2_trans_verify_paths(trans);
-
-       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-       u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
-       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-       if (u64s_added > live_u64s_added &&
-           bch2_maybe_compact_whiteouts(c, b))
-               bch2_trans_node_reinit_iter(trans, b);
-
-       btree_update_updated_node(as, b);
-       bch2_btree_node_unlock_write(trans, path, b);
-       return 0;
-split:
-       /*
-        * We could attempt to avoid the transaction restart, by calling
-        * bch2_btree_path_upgrade() and allocating more nodes:
-        */
-       if (b->c.level >= as->update_level_end) {
-               trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
-               return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
-       }
-
-       return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
-                         btree_path_idx_t path,
-                         unsigned flags)
-{
-       /* btree_split & merge may both cause paths array to be reallocated */
-       struct btree *b = path_l(trans->paths + path)->b;
-       struct btree_update *as;
-       unsigned l;
-       int ret = 0;
-
-       as = bch2_btree_update_start(trans, trans->paths + path,
-                                    trans->paths[path].level,
-                                    true, 0, flags);
-       if (IS_ERR(as))
-               return PTR_ERR(as);
-
-       ret = btree_split(as, trans, path, b, NULL);
-       if (ret) {
-               bch2_btree_update_free(as, trans);
-               return ret;
-       }
-
-       bch2_btree_update_done(as, trans);
-
-       for (l = trans->paths[path].level + 1;
-            btree_node_intent_locked(&trans->paths[path], l) && !ret;
-            l++)
-               ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
-       return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
-                                  btree_path_idx_t path_idx)
-{
-       struct bch_fs *c = as->c;
-       struct btree_path *path = trans->paths + path_idx;
-       struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
-       BUG_ON(!btree_node_locked(path, b->c.level));
-
-       n = __btree_root_alloc(as, trans, b->c.level + 1);
-
-       bch2_btree_update_add_new_node(as, n);
-       six_unlock_write(&n->c.lock);
-
-       path->locks_want++;
-       BUG_ON(btree_node_locked(path, n->c.level));
-       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-       bch2_btree_path_level_init(trans, path, n);
-
-       n->sib_u64s[0] = U16_MAX;
-       n->sib_u64s[1] = U16_MAX;
-
-       bch2_keylist_add(&as->parent_keys, &b->key);
-       btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
-       int ret = bch2_btree_set_root(as, trans, path, n, true);
-       BUG_ON(ret);
-
-       bch2_btree_update_get_open_buckets(as, n);
-       bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-       bch2_trans_node_add(trans, path, n);
-       six_unlock_intent(&n->c.lock);
-
-       mutex_lock(&c->btree_cache.lock);
-       list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
-       mutex_unlock(&c->btree_cache.lock);
-
-       bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
-       if (btree_node_fake(b))
-               return bch2_btree_split_leaf(trans, path, flags);
-
-       struct btree_update *as =
-               bch2_btree_update_start(trans, trans->paths + path, b->c.level,
-                                       true, 0, flags);
-       if (IS_ERR(as))
-               return PTR_ERR(as);
-
-       __btree_increase_depth(as, trans, path);
-       bch2_btree_update_done(as, trans);
-       return 0;
-}
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                 btree_path_idx_t path,
-                                 unsigned level,
-                                 unsigned flags,
-                                 enum btree_node_sibling sib)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_update *as;
-       struct bkey_format_state new_s;
-       struct bkey_format new_f;
-       struct bkey_i delete;
-       struct btree *b, *m, *n, *prev, *next, *parent;
-       struct bpos sib_pos;
-       size_t sib_u64s;
-       enum btree_id btree = trans->paths[path].btree_id;
-       btree_path_idx_t sib_path = 0, new_path = 0;
-       u64 start_time = local_clock();
-       int ret = 0;
-
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-       BUG_ON(!trans->paths[path].should_be_locked);
-       BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
-       /*
-        * Work around a deadlock caused by the btree write buffer not doing
-        * merges and leaving tons of merges for us to do - we really don't need
-        * to be doing merges at all from the interior update path, and if the
-        * interior update path is generating too many new interior updates we
-        * deadlock:
-        */
-       if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
-               return 0;
-
-       if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
-               flags &= ~BCH_WATERMARK_MASK;
-               flags |= BCH_WATERMARK_btree;
-               flags |= BCH_TRANS_COMMIT_journal_reclaim;
-       }
-
-       b = trans->paths[path].l[level].b;
-
-       if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
-           (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
-               b->sib_u64s[sib] = U16_MAX;
-               return 0;
-       }
-
-       sib_pos = sib == btree_prev_sib
-               ? bpos_predecessor(b->data->min_key)
-               : bpos_successor(b->data->max_key);
-
-       sib_path = bch2_path_get(trans, btree, sib_pos,
-                                U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
-       ret = bch2_btree_path_traverse(trans, sib_path, false);
-       if (ret)
-               goto err;
-
-       btree_path_set_should_be_locked(trans, trans->paths + sib_path);
-
-       m = trans->paths[sib_path].l[level].b;
-
-       if (btree_node_parent(trans->paths + path, b) !=
-           btree_node_parent(trans->paths + sib_path, m)) {
-               b->sib_u64s[sib] = U16_MAX;
-               goto out;
-       }
-
-       if (sib == btree_prev_sib) {
-               prev = m;
-               next = b;
-       } else {
-               prev = b;
-               next = m;
-       }
-
-       if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
-               struct printbuf buf = PRINTBUF;
-
-               printbuf_indent_add_nextline(&buf, 2);
-               prt_printf(&buf, "%s(): ", __func__);
-               ret = __bch2_topology_error(c, &buf);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "prev ends at   ");
-               bch2_bpos_to_text(&buf, prev->data->max_key);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "next starts at ");
-               bch2_bpos_to_text(&buf, next->data->min_key);
-
-               bch_err(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               goto err;
-       }
-
-       bch2_bkey_format_init(&new_s);
-       bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
-       __bch2_btree_calc_format(&new_s, prev);
-       __bch2_btree_calc_format(&new_s, next);
-       bch2_bkey_format_add_pos(&new_s, next->data->max_key);
-       new_f = bch2_bkey_format_done(&new_s);
-
-       sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
-               btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
-       if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
-               sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-               sib_u64s /= 2;
-               sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-       }
-
-       sib_u64s = min(sib_u64s, btree_max_u64s(c));
-       sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
-       b->sib_u64s[sib] = sib_u64s;
-
-       if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
-               goto out;
-
-       parent = btree_node_parent(trans->paths + path, b);
-       as = bch2_btree_update_start(trans, trans->paths + path, level, false,
-                                    0, BCH_TRANS_COMMIT_no_enospc|flags);
-       ret = PTR_ERR_OR_ZERO(as);
-       if (ret)
-               goto err;
-
-       as->node_start  = prev->data->min_key;
-       as->node_end    = next->data->max_key;
-
-       trace_and_count(c, btree_node_merge, trans, b);
-
-       n = bch2_btree_node_alloc(as, trans, b->c.level);
-
-       SET_BTREE_NODE_SEQ(n->data,
-                          max(BTREE_NODE_SEQ(b->data),
-                              BTREE_NODE_SEQ(m->data)) + 1);
-
-       btree_set_min(n, prev->data->min_key);
-       btree_set_max(n, next->data->max_key);
-
-       n->data->format  = new_f;
-       btree_node_set_format(n, new_f);
-
-       bch2_btree_sort_into(c, n, prev);
-       bch2_btree_sort_into(c, n, next);
-
-       bch2_btree_build_aux_trees(n);
-       bch2_btree_update_add_new_node(as, n);
-       six_unlock_write(&n->c.lock);
-
-       new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
-       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-       bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-       bkey_init(&delete.k);
-       delete.k.p = prev->key.k.p;
-       bch2_keylist_add(&as->parent_keys, &delete);
-       bch2_keylist_add(&as->parent_keys, &n->key);
-
-       bch2_trans_verify_paths(trans);
-
-       ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-       if (ret)
-               goto err_free_update;
-
-       bch2_btree_interior_update_will_free_node(as, b);
-       bch2_btree_interior_update_will_free_node(as, m);
-
-       bch2_trans_verify_paths(trans);
-
-       bch2_btree_update_get_open_buckets(as, n);
-       bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
-       bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-       bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
-       bch2_trans_node_add(trans, trans->paths + path, n);
-
-       bch2_trans_verify_paths(trans);
-
-       six_unlock_intent(&n->c.lock);
-
-       bch2_btree_update_done(as, trans);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
-       if (new_path)
-               bch2_path_put(trans, new_path, true);
-       bch2_path_put(trans, sib_path, true);
-       bch2_trans_verify_locks(trans);
-       if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
-               ret = 0;
-       if (!ret)
-               ret = bch2_trans_relock(trans);
-       return ret;
-err_free_update:
-       bch2_btree_node_free_never_used(as, trans, n);
-       bch2_btree_update_free(as, trans);
-       goto out;
-}
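
The BTREE_FOREGROUND_MERGE_HYSTERESIS block above deserves a gloss: when the
combined size of the two siblings exceeds the hysteresis limit, the overshoot
is halved before the value is recorded in b->sib_u64s, pulling it halfway
back toward the limit before it is compared against the merge threshold -
which, as the name suggests, damps merge/split ping-ponging for nodes
hovering near the boundary. Worked through with made-up numbers (not real
bcachefs constants):

#include <stdio.h>

int main(void)
{
        size_t H = 1000;                /* stand-in for the hysteresis limit */
        size_t sib_u64s = 1400;         /* combined size of b + sibling */

        if (sib_u64s > H) {
                sib_u64s -= H;          /* overshoot: 400 */
                sib_u64s /= 2;          /* halve it:  200 */
                sib_u64s += H;          /* 1200 - halfway back toward H */
        }

        printf("recorded sib_u64s = %zu\n", sib_u64s);  /* prints 1200 */
        return 0;
}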
-
-static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
-                           struct btree *b)
-{
-       bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
-                                 BTREE_MAX_DEPTH, b->c.level,
-                                 BTREE_ITER_intent);
-       int ret = bch2_btree_iter_traverse(trans, iter);
-       if (ret)
-               goto err;
-
-       /* has the node been freed? */
-       if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
-               BUG_ON(!btree_node_dying(b));
-               ret = bch_err_throw(trans->c, btree_node_dying);
-               goto err;
-       }
-
-       BUG_ON(!btree_node_hashed(b));
-       return 0;
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
-                           struct btree_iter *iter,
-                           struct btree *b,
-                           unsigned target,
-                           unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *n, *parent;
-       struct btree_update *as;
-       btree_path_idx_t new_path = 0;
-       int ret;
-
-       flags |= BCH_TRANS_COMMIT_no_enospc;
-
-       struct btree_path *path = btree_iter_path(trans, iter);
-       parent = btree_node_parent(path, b);
-       as = bch2_btree_update_start(trans, path, b->c.level,
-                                    false, target, flags);
-       ret = PTR_ERR_OR_ZERO(as);
-       if (ret)
-               goto out;
-
-       n = bch2_btree_node_alloc_replacement(as, trans, b);
-
-       bch2_btree_build_aux_trees(n);
-       bch2_btree_update_add_new_node(as, n);
-       six_unlock_write(&n->c.lock);
-
-       new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
-       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-       bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-       trace_and_count(c, btree_node_rewrite, trans, b);
-
-       if (parent) {
-               bch2_keylist_add(&as->parent_keys, &n->key);
-               ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
-       } else {
-               ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
-       }
-
-       if (ret)
-               goto err;
-
-       bch2_btree_interior_update_will_free_node(as, b);
-
-       bch2_btree_update_get_open_buckets(as, n);
-       bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
-       bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
-       bch2_trans_node_add(trans, trans->paths + iter->path, n);
-       six_unlock_intent(&n->c.lock);
-
-       bch2_btree_update_done(as, trans);
-out:
-       if (new_path)
-               bch2_path_put(trans, new_path, true);
-       bch2_trans_downgrade(trans);
-       return ret;
-err:
-       bch2_btree_node_free_never_used(as, trans, n);
-       bch2_btree_update_free(as, trans);
-       goto out;
-}
-
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
-                               enum btree_id btree, unsigned level,
-                               struct bkey_i *k, unsigned flags)
-{
-       struct btree_iter iter;
-       bch2_trans_node_iter_init(trans, &iter,
-                                 btree, k->k.p,
-                                 BTREE_MAX_DEPTH, level, 0);
-       struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-       int ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               goto out;
-
-       bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
-       ret = found
-               ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
-               : -ENOENT;
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
-                               enum btree_id btree, unsigned level,
-                               struct bpos pos,
-                               unsigned target,
-                               unsigned flags)
-{
-       BUG_ON(!level);
-
-       /* Traverse one depth lower to get a pointer to the node itself: */
-       struct btree_iter iter;
-       bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
-       struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-       int ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               goto err;
-
-       ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
-                                        struct btree *b, unsigned flags)
-{
-       struct btree_iter iter;
-       int ret = get_iter_to_node(trans, &iter, b);
-       if (ret)
-               return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
-       ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-struct async_btree_rewrite {
-       struct bch_fs           *c;
-       struct work_struct      work;
-       struct list_head        list;
-       enum btree_id           btree_id;
-       unsigned                level;
-       struct bkey_buf         key;
-};
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
-       struct async_btree_rewrite *a =
-               container_of(work, struct async_btree_rewrite, work);
-       struct bch_fs *c = a->c;
-
-       int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
-                                               a->btree_id, a->level, a->key.k, 0));
-       if (!bch2_err_matches(ret, ENOENT) &&
-           !bch2_err_matches(ret, EROFS))
-               bch_err_fn_ratelimited(c, ret);
-
-       spin_lock(&c->btree_node_rewrites_lock);
-       list_del(&a->list);
-       spin_unlock(&c->btree_node_rewrites_lock);
-
-       closure_wake_up(&c->btree_node_rewrites_wait);
-
-       bch2_bkey_buf_exit(&a->key, c);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
-       kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
-       struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
-       if (!a)
-               return;
-
-       a->c            = c;
-       a->btree_id     = b->c.btree_id;
-       a->level        = b->c.level;
-       INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
-       bch2_bkey_buf_init(&a->key);
-       bch2_bkey_buf_copy(&a->key, c, &b->key);
-
-       bool now = false, pending = false;
-
-       spin_lock(&c->btree_node_rewrites_lock);
-       if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
-           enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
-               list_add(&a->list, &c->btree_node_rewrites);
-               now = true;
-       } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
-               list_add(&a->list, &c->btree_node_rewrites_pending);
-               pending = true;
-       }
-       spin_unlock(&c->btree_node_rewrites_lock);
-
-       if (now) {
-               queue_work(c->btree_node_rewrite_worker, &a->work);
-       } else if (pending) {
-               /* bch2_do_pending_node_rewrites() will run it later */
-       } else {
-               bch2_bkey_buf_exit(&a->key, c);
-               kfree(a);
-       }
-}
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
-{
-       closure_wait_event(&c->btree_node_rewrites_wait,
-                          list_empty(&c->btree_node_rewrites));
-}
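
bch2_async_btree_node_rewrites_flush() is a closure-based wait: it blocks
until the in-flight rewrite list drains, with async_btree_node_rewrite_work()
providing the wakeup via closure_wake_up(). The equivalent userspace shape,
using a condition variable and a counter instead of a list (hypothetical
names, not bcachefs code):

#include <pthread.h>

struct rewrite_queue {
        pthread_mutex_t lock;
        pthread_cond_t  empty;
        unsigned        nr_in_flight;
};

static void rewrite_done(struct rewrite_queue *q)
{
        pthread_mutex_lock(&q->lock);
        if (!--q->nr_in_flight)
                pthread_cond_broadcast(&q->empty);      /* wake any flushers */
        pthread_mutex_unlock(&q->lock);
}

static void rewrites_flush(struct rewrite_queue *q)
{
        pthread_mutex_lock(&q->lock);
        while (q->nr_in_flight)                 /* recheck after every wakeup */
                pthread_cond_wait(&q->empty, &q->lock);
        pthread_mutex_unlock(&q->lock);
}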
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
-       while (1) {
-               spin_lock(&c->btree_node_rewrites_lock);
-               struct async_btree_rewrite *a =
-                       list_pop_entry(&c->btree_node_rewrites_pending,
-                                      struct async_btree_rewrite, list);
-               if (a)
-                       list_add(&a->list, &c->btree_node_rewrites);
-               spin_unlock(&c->btree_node_rewrites_lock);
-
-               if (!a)
-                       break;
-
-               enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
-               queue_work(c->btree_node_rewrite_worker, &a->work);
-       }
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
-       while (1) {
-               spin_lock(&c->btree_node_rewrites_lock);
-               struct async_btree_rewrite *a =
-                       list_pop_entry(&c->btree_node_rewrites_pending,
-                                      struct async_btree_rewrite, list);
-               spin_unlock(&c->btree_node_rewrites_lock);
-
-               if (!a)
-                       break;
-
-               bch2_bkey_buf_exit(&a->key, c);
-               kfree(a);
-       }
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
-                                       struct btree_iter *iter,
-                                       struct btree *b, struct btree *new_hash,
-                                       struct bkey_i *new_key,
-                                       unsigned commit_flags,
-                                       bool skip_triggers)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter2 = {};
-       struct btree *parent;
-       int ret;
-
-       if (!skip_triggers) {
-               ret   = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
-                                            bkey_i_to_s_c(&b->key),
-                                            BTREE_TRIGGER_transactional) ?:
-                       bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
-                                            bkey_i_to_s(new_key),
-                                            BTREE_TRIGGER_transactional);
-               if (ret)
-                       return ret;
-       }
-
-       if (new_hash) {
-               bkey_copy(&new_hash->key, new_key);
-               ret = bch2_btree_node_hash_insert(&c->btree_cache,
-                               new_hash, b->c.level, b->c.btree_id);
-               BUG_ON(ret);
-       }
-
-       parent = btree_node_parent(btree_iter_path(trans, iter), b);
-       if (parent) {
-               bch2_trans_copy_iter(trans, &iter2, iter);
-
-               iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
-                               iter2.flags & BTREE_ITER_intent,
-                               _THIS_IP_);
-
-               struct btree_path *path2 = btree_iter_path(trans, &iter2);
-               BUG_ON(path2->level != b->c.level);
-               BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
-               btree_path_set_level_up(trans, path2);
-
-               trans->paths_sorted = false;
-
-               ret   = bch2_btree_iter_traverse(trans, &iter2) ?:
-                       bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
-               if (ret)
-                       goto err;
-       } else {
-               BUG_ON(btree_node_root(c, b) != b);
-
-               struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
-                                      jset_u64s(new_key->k.u64s));
-               ret = PTR_ERR_OR_ZERO(e);
-               if (ret)
-                       return ret;
-
-               journal_entry_set(e,
-                                 BCH_JSET_ENTRY_btree_root,
-                                 b->c.btree_id, b->c.level,
-                                 new_key, new_key->k.u64s);
-       }
-
-       ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
-       if (ret)
-               goto err;
-
-       bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
-       if (new_hash) {
-               mutex_lock(&c->btree_cache.lock);
-               bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-
-               __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-               bkey_copy(&b->key, new_key);
-               ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-               BUG_ON(ret);
-               mutex_unlock(&c->btree_cache.lock);
-       } else {
-               bkey_copy(&b->key, new_key);
-       }
-
-       bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
-       bch2_trans_iter_exit(trans, &iter2);
-       return ret;
-err:
-       if (new_hash) {
-               mutex_lock(&c->btree_cache.lock);
-               bch2_btree_node_hash_remove(&c->btree_cache, b);
-               mutex_unlock(&c->btree_cache.lock);
-       }
-       goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
-                              struct btree *b, struct bkey_i *new_key,
-                              unsigned commit_flags, bool skip_triggers)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *new_hash = NULL;
-       struct btree_path *path = btree_iter_path(trans, iter);
-       struct closure cl;
-       int ret = 0;
-
-       ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
-       if (ret)
-               return ret;
-
-       closure_init_stack(&cl);
-
-       /*
-        * check btree_ptr_hash_val() after @b is locked by
-        * btree_iter_traverse():
-        */
-       if (btree_ptr_hash_val(new_key) != b->hash_val) {
-               ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-               if (ret) {
-                       ret = drop_locks_do(trans, (closure_sync(&cl), 0));
-                       if (ret)
-                               return ret;
-               }
-
-               new_hash = bch2_btree_node_mem_alloc(trans, false);
-               ret = PTR_ERR_OR_ZERO(new_hash);
-               if (ret)
-                       goto err;
-       }
-
-       path->intent_ref++;
-       ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
-                                          commit_flags, skip_triggers);
-       --path->intent_ref;
-
-       if (new_hash)
-               bch2_btree_node_to_freelist(c, new_hash);
-err:
-       closure_sync(&cl);
-       bch2_btree_cache_cannibalize_unlock(trans);
-       return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
-                                       struct btree *b, struct bkey_i *new_key,
-                                       unsigned commit_flags, bool skip_triggers)
-{
-       struct btree_iter iter;
-       int ret = get_iter_to_node(trans, &iter, b);
-       if (ret)
-               return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
-       bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
-                           !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
-       ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
-                                        commit_flags, skip_triggers);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup: when first reading the btree roots, or when
- * allocating btree roots while initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
-       BUG_ON(btree_node_root(c, b));
-
-       bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
-       struct bch_fs *c = trans->c;
-       struct closure cl;
-       struct btree *b;
-       int ret;
-
-       closure_init_stack(&cl);
-
-       do {
-               ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-               closure_sync(&cl);
-       } while (ret);
-
-       b = bch2_btree_node_mem_alloc(trans, false);
-       bch2_btree_cache_cannibalize_unlock(trans);
-
-       ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               return ret;
-
-       set_btree_node_fake(b);
-       set_btree_node_need_rewrite(b);
-       b->c.level      = level;
-       b->c.btree_id   = id;
-
-       bkey_btree_ptr_init(&b->key);
-       b->key.k.p = SPOS_MAX;
-       *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
-       bch2_bset_init_first(b, &b->data->keys);
-       bch2_btree_build_aux_trees(b);
-
-       b->data->flags = 0;
-       btree_set_min(b, POS_MIN);
-       btree_set_max(b, SPOS_MAX);
-       b->data->format = bch2_btree_calc_format(b);
-       btree_node_set_format(b, b->data->format);
-
-       ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
-                                         b->c.level, b->c.btree_id);
-       BUG_ON(ret);
-
-       bch2_btree_set_root_inmem(c, b);
-
-       six_unlock_write(&b->c.lock);
-       six_unlock_intent(&b->c.lock);
-       return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
-       bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
-       prt_printf(out, "%ps: ", (void *) as->ip_started);
-       bch2_trans_commit_flags_to_text(out, as->flags);
-
-       prt_str(out, " ");
-       bch2_btree_id_to_text(out, as->btree_id);
-       prt_printf(out, " l=%u-%u ",
-                  as->update_level_start,
-                  as->update_level_end);
-       bch2_bpos_to_text(out, as->node_start);
-       prt_char(out, ' ');
-       bch2_bpos_to_text(out, as->node_end);
-       prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
-                  as->node_written,
-                  as->node_sectors,
-                  as->node_remaining,
-                  btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
-
-       prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
-                  bch2_btree_update_modes[as->mode],
-                  as->nodes_written,
-                  closure_nr_remaining(&as->cl),
-                  as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct btree_update *as;
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_for_each_entry(as, &c->btree_interior_update_list, list)
-               bch2_btree_update_to_text(out, as);
-       mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
-       bool ret;
-
-       mutex_lock(&c->btree_interior_update_lock);
-       ret = !list_empty(&c->btree_interior_update_list);
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
-       bool ret = bch2_btree_interior_updates_pending(c);
-
-       if (ret)
-               closure_wait_event(&c->btree_interior_update_wait,
-                                  !bch2_btree_interior_updates_pending(c));
-       return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
-       struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
-       mutex_lock(&c->btree_root_lock);
-
-       r->level = entry->level;
-       r->alive = true;
-       bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
-       mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
-                                   struct jset_entry *end,
-                                   unsigned long skip)
-{
-       unsigned i;
-
-       mutex_lock(&c->btree_root_lock);
-
-       for (i = 0; i < btree_id_nr_alive(c); i++) {
-               struct btree_root *r = bch2_btree_id_root(c, i);
-
-               if (r->alive && !test_bit(i, &skip)) {
-                       journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
-                                         i, r->level, &r->key, r->key.k.u64s);
-                       end = vstruct_next(end);
-               }
-       }
-
-       mutex_unlock(&c->btree_root_lock);
-
-       return end;
-}
-
-static void bch2_btree_alloc_to_text(struct printbuf *out,
-                                    struct bch_fs *c,
-                                    struct btree_alloc *a)
-{
-       printbuf_indent_add(out, 2);
-       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
-       prt_newline(out);
-
-       struct open_bucket *ob;
-       unsigned i;
-       open_bucket_for_each(c, &a->ob, ob, i)
-               bch2_open_bucket_to_text(out, c, ob);
-
-       printbuf_indent_sub(out, 2);
-}
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
-               bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
-       WARN_ON(!list_empty(&c->btree_node_rewrites));
-       WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
-
-       if (c->btree_node_rewrite_worker)
-               destroy_workqueue(c->btree_node_rewrite_worker);
-       if (c->btree_interior_update_worker)
-               destroy_workqueue(c->btree_interior_update_worker);
-       mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
-       mutex_init(&c->btree_reserve_cache_lock);
-       INIT_LIST_HEAD(&c->btree_interior_update_list);
-       INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
-       mutex_init(&c->btree_interior_update_lock);
-       INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
-       INIT_LIST_HEAD(&c->btree_node_rewrites);
-       INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
-       spin_lock_init(&c->btree_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
-       c->btree_interior_update_worker =
-               alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
-       if (!c->btree_interior_update_worker)
-               return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
-       c->btree_node_rewrite_worker =
-               alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
-       if (!c->btree_node_rewrite_worker)
-               return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
-       if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
-                                     sizeof(struct btree_update)))
-               return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
-
-       return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644 (file)
index ac04e45..0000000
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
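-/*
- * Max nodes an interior update may allocate: a worst case split allocates two
- * nodes per level, plus slack for gc node merging:
- */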
-#define BTREE_UPDATE_NODES_MAX         ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES       (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES()   \
-       x(none)                 \
-       x(node)                 \
-       x(root)                 \
-       x(update)
-
-enum btree_update_mode {
-#define x(n)   BTREE_UPDATE_##n,
-       BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in-progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- */
-struct btree_update {
-       struct closure                  cl;
-       struct bch_fs                   *c;
-       u64                             start_time;
-       unsigned long                   ip_started;
-
-       struct list_head                list;
-       struct list_head                unwritten_list;
-
-       enum btree_update_mode          mode;
-       enum bch_trans_commit_flags     flags;
-       unsigned                        nodes_written:1;
-       unsigned                        took_gc_lock:1;
-
-       enum btree_id                   btree_id;
-       struct bpos                     node_start;
-       struct bpos                     node_end;
-       enum btree_node_rewrite_reason  node_needed_rewrite;
-       u16                             node_written;
-       u16                             node_sectors;
-       u16                             node_remaining;
-
-       unsigned                        update_level_start;
-       unsigned                        update_level_end;
-
-       struct disk_reservation         disk_res;
-
-       /*
-        * BTREE_UPDATE_node:
-        * The update that made the new nodes visible was a regular update to an
-        * existing interior node - @b. We can't write out the update to @b
-        * until the new nodes we created are finished writing, so we block @b
-        * from writing by putting this btree_interior update on the
-        * @b->write_blocked list with @write_blocked_list:
-        */
-       struct btree                    *b;
-       struct list_head                write_blocked_list;
-
-       /*
-        * We may be freeing nodes that were dirty, and thus had journal entries
-        * pinned: we need to transfer the oldest of those pins to the
-        * btree_update operation, and release it when the new node(s)
-        * are all persistent and reachable:
-        */
-       struct journal_entry_pin        journal;
-
-       /* Preallocated nodes we reserve when we start the update: */
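-	/* (separate reserves for leaf and interior nodes) */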
-       struct prealloc_nodes {
-               struct btree            *b[BTREE_UPDATE_NODES_MAX];
-               unsigned                nr;
-       }                               prealloc_nodes[2];
-
-       /* Nodes being freed: */
-       struct keylist                  old_keys;
-       u64                             _old_keys[BTREE_UPDATE_NODES_MAX *
-                                                 BKEY_BTREE_PTR_U64s_MAX];
-
-       /* Nodes being added: */
-       struct keylist                  new_keys;
-       u64                             _new_keys[BTREE_UPDATE_NODES_MAX *
-                                                 BKEY_BTREE_PTR_U64s_MAX];
-
-       /* New nodes, that will be made reachable by this update: */
-       struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
-       unsigned                        nr_new_nodes;
-
-       struct btree                    *old_nodes[BTREE_UPDATE_NODES_MAX];
-       __le64                          old_nodes_seq[BTREE_UPDATE_NODES_MAX];
-       unsigned                        nr_old_nodes;
-
-       open_bucket_idx_t               open_buckets[BTREE_UPDATE_NODES_MAX *
-                                                    BCH_REPLICAS_MAX];
-       open_bucket_idx_t               nr_open_buckets;
-
-       unsigned                        journal_u64s;
-       u64                             journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
-       /* Only here to reduce stack usage on recursive splits: */
-       struct keylist                  parent_keys;
-       /*
-        * Enough room for btree_split's keys without realloc - btree node
- * pointers never have crc/compression info, so we only need to account
-        * for the pointers for three keys
-        */
-       u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
-                                                 struct btree_trans *,
-                                                 struct btree *,
-                                                 struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
-                                 unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-                                       btree_path_idx_t path_idx,
-                                       unsigned level, unsigned flags,
-                                       enum btree_node_sibling sib)
-{
-       struct btree_path *path = trans->paths + path_idx;
-       struct btree *b;
-
-       EBUG_ON(!btree_node_locked(path, level));
-
-       if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
-               return 0;
-
-       b = path->l[level].b;
-       if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
-               return 0;
-
-       return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                             btree_path_idx_t path,
-                                             unsigned level,
-                                             unsigned flags)
-{
-       bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-       return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-                                                   btree_prev_sib) ?:
-               bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-                                                   btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
-                           struct btree *, unsigned, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
-                               enum btree_id, unsigned,
-                               struct bkey_i *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
-                               enum btree_id, unsigned,
-                               struct bpos, unsigned, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
-                                        struct btree *, unsigned);
-
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
-                              struct btree *, struct bkey_i *,
-                              unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
-                                       struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
-                                                    struct btree *b)
-{
-       unsigned depth = btree_node_root(c, b)->c.level + 1;
-
-       /*
-        * Number of nodes we might have to allocate in a worst case btree
-        * split operation - we split all the way up to the root, then allocate
-        * a new root, unless we're already at max depth:
-        */
-       if (depth < BTREE_MAX_DEPTH)
-               return (depth - b->c.level) * 2 + 1;
-       else
-               return (depth - b->c.level) * 2 - 1;
-}
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
-       b->sib_u64s[0] = b->nr.live_u64s;
-       b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
-       return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
-       return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
-       return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
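-	/* b->written counts 512 byte sectors already written to disk */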
-       return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
-       return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
-       return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
-       return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
-       ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
-               b->whiteout_u64s;
-       ssize_t total = btree_buf_bytes(b) >> 3;
-
-       /* Always leave one extra u64 for bch2_varint_decode: */
-       used++;
-
-       return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
-       ssize_t remaining = __bch2_btree_u64s_remaining(b,
-                               btree_bkey_last(b, bset_tree_last(b)));
-
-       BUG_ON(remaining < 0);
-
-       if (bset_written(b, btree_bset_last(b)))
-               return 0;
-
-       return remaining;
-}
-
-#define BTREE_WRITE_SET_U64s_BITS      9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
-       /*
-        * Could buffer up larger amounts of keys for btrees with larger keys,
-        * pending benchmarking:
-        */
-       return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
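-/*
- * Returns where a new bset should start, if it's time to start one: either the
- * last bset was already written out, or it's grown past the write set buffer
- * size; NULL otherwise:
- */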
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
-       struct bset_tree *t = bset_tree_last(b);
-       struct btree_node_entry *bne = max(write_block(b),
-                       (void *) btree_bkey_last(b, t));
-       ssize_t remaining_space =
-               __bch2_btree_u64s_remaining(b, bne->keys.start);
-
-       if (unlikely(bset_written(b, bset(b, t)))) {
-               if (b->written + block_sectors(c) <= btree_sectors(c))
-                       return bne;
-       } else {
-               if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
-                   remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
-                       return bne;
-       }
-
-       return NULL;
-}
-
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
-       struct bkey_packed k;
-
-       BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
-       EBUG_ON(btree_node_just_written(b));
-
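-	/* if @pos can't be packed in the btree node's format, store it unpacked: */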
-       if (!bkey_pack_pos(&k, pos, b)) {
-               struct bkey *u = (void *) &k;
-
-               bkey_init(u);
-               u->p = pos;
-       }
-
-       k.needs_whiteout = true;
-
-       b->whiteout_u64s += k.u64s;
-       bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
-       if (unlikely(btree_node_need_rewrite(b)))
-               return false;
-
-       return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
-                                       struct jset_entry *, unsigned long);
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644 (file)
index 0afb44c..0000000
+++ /dev/null
@@ -1,893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
-                               struct journal_entry_pin *, u64);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-       return (cmp_int(l->hi, r->hi) ?:
-               cmp_int(l->mi, r->mi) ?:
-               cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
-       int cmp;
-
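-	/*
-	 * 192 bit unsigned compare via chained sub/sbb: "ae" (no borrow)
-	 * means l >= r, matching __wb_key_ref_cmp():
-	 */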
-       asm("mov   (%[l]), %%rax;"
-           "sub   (%[r]), %%rax;"
-           "mov  8(%[l]), %%rax;"
-           "sbb  8(%[r]), %%rax;"
-           "mov 16(%[l]), %%rax;"
-           "sbb 16(%[r]), %%rax;"
-           : "=@ccae" (cmp)
-           : [l] "r" (l), [r] "r" (r)
-           : "rax", "cc");
-
-       EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
-       return cmp;
-#else
-       return __wb_key_ref_cmp(l, r);
-#endif
-}
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
-       const struct btree_write_buffered_key *l = _l;
-       const struct btree_write_buffered_key *r = _r;
-
-       return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
-       const struct wb_key_ref *l = _l;
-       const struct wb_key_ref *r = _r;
-
-       return !((l->hi ^ r->hi)|
-                (l->mi ^ r->mi)|
-                ((l->lo >> 24) ^ (r->lo >> 24)));
-}
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
-       size_t n = num, a = num / 2;
-
-       if (!a)         /* num < 2 || size == 0 */
-               return;
-
-       for (;;) {
-               size_t b, c, d;
-
-               if (a)                  /* Building heap: sift down --a */
-                       --a;
-               else if (--n)           /* Sorting: Extract root to --n */
-                       swap(base[0], base[n]);
-               else                    /* Sort complete */
-                       break;
-
-               /*
-                * Sift element at "a" down into heap.  This is the
-                * "bottom-up" variant, which significantly reduces
-                * calls to cmp_func(): we find the sift-down path all
-                * the way to the leaves (one compare per level), then
-                * backtrack to find where to insert the target element.
-                *
-                * Because elements tend to sift down close to the leaves,
-                * this uses fewer compares than doing two per level
-                * on the way down.  (A bit more than half as many on
-                * average, 3/4 worst-case.)
-                */
-               for (b = a; c = 2*b + 1, (d = c + 1) < n;)
-                       b = wb_key_ref_cmp(base + c, base + d) ? c : d;
-               if (d == n)             /* Special case last leaf with no sibling */
-                       b = c;
-
-               /* Now backtrack from "b" to the correct location for "a" */
-               while (b != a && wb_key_ref_cmp(base + a, base + b))
-                       b = (b - 1) / 2;
-               c = b;                  /* Where "a" belongs */
-               while (b != a) {        /* Shift it into place */
-                       b = (b - 1) / 2;
-                       swap(base[b], base[c]);
-               }
-       }
-}
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
-                                         struct btree_iter *iter,
-                                         struct btree_write_buffered_key *wb)
-{
-       struct btree_path *path = btree_iter_path(trans, iter);
-
-       bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
-       trans->journal_res.seq = wb->journal_seq;
-
-       return bch2_trans_update(trans, iter, &wb->k,
-                                BTREE_UPDATE_internal_snapshot_node) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 BCH_TRANS_COMMIT_no_enospc|
-                                 BCH_TRANS_COMMIT_no_check_rw|
-                                 BCH_TRANS_COMMIT_no_journal_res|
-                                 BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
-                              struct btree_write_buffered_key *wb,
-                              bool *write_locked,
-                              bool *accounting_accumulated,
-                              size_t *fast)
-{
-       struct btree_path *path;
-       int ret;
-
-       EBUG_ON(!wb->journal_seq);
-       EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
-       EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
-       ret = bch2_btree_iter_traverse(trans, iter);
-       if (ret)
-               return ret;
-
-       if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
-               struct bkey u;
-               struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
-
-               if (k.k->type == KEY_TYPE_accounting)
-                       bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
-                                                  bkey_s_c_to_accounting(k));
-       }
-       *accounting_accumulated = true;
-
-       /*
-        * We can't clone a path that has write locks: unshare it now, before
-        * set_pos and traverse():
-        */
-       if (btree_iter_path(trans, iter)->ref > 1)
-               iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
-       path = btree_iter_path(trans, iter);
-
-       if (!*write_locked) {
-               ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
-               if (ret)
-                       return ret;
-
-               bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
-               *write_locked = true;
-       }
-
-       if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
-               *write_locked = false;
-               return wb_flush_one_slowpath(trans, iter, wb);
-       }
-
-       EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
-
-       bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
-       (*fast)++;
-       return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
- * have already been modified in the active write buffer in a seq that comes
- * before the current transaction. If we were to journal this key again and
- * crash, recovery would process updates in the wrong order.
- */
-static int
-btree_write_buffered_insert(struct btree_trans *trans,
-                         struct btree_write_buffered_key *wb)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
-                            BTREE_ITER_cached|BTREE_ITER_intent);
-
-       trans->journal_res.seq = wb->journal_seq;
-
-       ret   = bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_trans_update(trans, &iter, &wb->k,
-                                 BTREE_UPDATE_internal_snapshot_node);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
-{
-       struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
-       struct journal *j = &c->journal;
-
-       if (!wb->inc.keys.nr)
-               return;
-
-       bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
-                            bch2_btree_write_buffer_journal_flush);
-
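-	/*
-	 * Note: darray_resize() failure is tolerated - the flush then isn't
-	 * guaranteed to empty wb->inc (see btree_write_buffer_flush_seq()):
-	 */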
-       darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
-       darray_resize(&wb->sorted, wb->flushing.keys.size);
-
-       if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
-               swap(wb->flushing.keys, wb->inc.keys);
-               goto out;
-       }
-
-       size_t nr = min(darray_room(wb->flushing.keys),
-                       wb->sorted.size - wb->flushing.keys.nr);
-       nr = min(nr, wb->inc.keys.nr);
-
-       memcpy(&darray_top(wb->flushing.keys),
-              wb->inc.keys.data,
-              sizeof(wb->inc.keys.data[0]) * nr);
-
-       memmove(wb->inc.keys.data,
-               wb->inc.keys.data + nr,
-              sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
-
-       wb->flushing.keys.nr    += nr;
-       wb->inc.keys.nr         -= nr;
-out:
-       if (!wb->inc.keys.nr)
-               bch2_journal_pin_drop(j, &wb->inc.pin);
-       else
-               bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
-                                       bch2_btree_write_buffer_journal_flush);
-
-       if (j->watermark) {
-               spin_lock(&j->lock);
-               bch2_journal_set_watermark(j);
-               spin_unlock(&j->lock);
-       }
-
-       BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
-}
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *c,
-                                      enum btree_id btree, struct bkey_i *k)
-{
-       struct printbuf buf = PRINTBUF;
-
-       prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
-       bch2_btree_id_to_text(&buf, btree);
-       prt_str(&buf, "\n");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
-       bch2_fs_inconsistent(c, "%s", buf.buf);
-       printbuf_exit(&buf);
-       return -EROFS;
-}
-
-static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct journal *j = &c->journal;
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       struct btree_iter iter = {};
-       size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
-       bool write_locked = false;
-       bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
-       int ret = 0;
-
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               return ret;
-
-       bch2_trans_unlock(trans);
-       bch2_trans_begin(trans);
-
-       mutex_lock(&wb->inc.lock);
-       move_keys_from_inc_to_flushing(wb);
-       mutex_unlock(&wb->inc.lock);
-
-       for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
-               wb->sorted.data[i].idx = i;
-               wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
-               memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
-       }
-       wb->sorted.nr = wb->flushing.keys.nr;
-
-       /*
-        * We first sort so that we can detect and skip redundant updates, and
-        * then we attempt to flush in sorted btree order, as this is most
-        * efficient.
-        *
-        * However, since we're not flushing in the order they appear in the
-        * journal we won't be able to drop our journal pin until everything is
-        * flushed - which means this could deadlock the journal if we weren't
-        * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
-        * if it would block taking a journal reservation.
-        *
-        * If that happens, simply skip the key so we can optimistically insert
-        * as many keys as possible in the fast path.
-        */
-       wb_sort(wb->sorted.data, wb->sorted.nr);
-
-       darray_for_each(wb->sorted, i) {
-               struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-
-               if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
-                       ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k);
-                       goto err;
-               }
-
-               for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
-                       prefetch(&wb->flushing.keys.data[n->idx]);
-
-               BUG_ON(!k->journal_seq);
-
-               if (!accounting_replay_done &&
-                   k->k.k.type == KEY_TYPE_accounting) {
-                       slowpath++;
-                       continue;
-               }
-
-               if (i + 1 < &darray_top(wb->sorted) &&
-                   wb_key_eq(i, i + 1)) {
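-			/*
-			 * Duplicate update: drop this key, accumulating it
-			 * into the next one if it's an accounting key, and
-			 * keep the oldest journal seq:
-			 */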
-                       struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
-
-                       if (k->k.k.type == KEY_TYPE_accounting &&
-                           n->k.k.type == KEY_TYPE_accounting)
-                               bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
-                                                          bkey_i_to_s_c_accounting(&k->k));
-
-                       overwritten++;
-                       n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
-                       k->journal_seq = 0;
-                       continue;
-               }
-
-               if (write_locked) {
-                       struct btree_path *path = btree_iter_path(trans, &iter);
-
-                       if (path->btree_id != i->btree ||
-                           bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
-                               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-                               write_locked = false;
-
-                               ret = lockrestart_do(trans,
-                                       bch2_btree_iter_traverse(trans, &iter) ?:
-                                       bch2_foreground_maybe_merge(trans, iter.path, 0,
-                                                       BCH_WATERMARK_reclaim|
-                                                       BCH_TRANS_COMMIT_journal_reclaim|
-                                                       BCH_TRANS_COMMIT_no_check_rw|
-                                                       BCH_TRANS_COMMIT_no_enospc));
-                               if (ret)
-                                       goto err;
-                       }
-               }
-
-               if (!iter.path || iter.btree_id != k->btree) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
-                                            BTREE_ITER_intent|BTREE_ITER_all_snapshots);
-               }
-
-               bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
-               btree_iter_path(trans, &iter)->preserve = false;
-
-               bool accounting_accumulated = false;
-               do {
-                       if (race_fault()) {
-                               ret = bch_err_throw(c, journal_reclaim_would_deadlock);
-                               break;
-                       }
-
-                       ret = wb_flush_one(trans, &iter, k, &write_locked,
-                                          &accounting_accumulated, &fast);
-                       if (!write_locked)
-                               bch2_trans_begin(trans);
-               } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-               if (!ret) {
-                       k->journal_seq = 0;
-               } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
-                       slowpath++;
-                       ret = 0;
-               } else
-                       break;
-       }
-
-       if (write_locked) {
-               struct btree_path *path = btree_iter_path(trans, &iter);
-               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               goto err;
-
-       if (slowpath) {
-               /*
-		 * Flush the remaining keys in the order they were present in
-		 * the journal, so that we can release journal pins; the
-		 * fastpath zapped the seq of keys that were successfully
-		 * flushed, so we can skip those here.
-                */
-               trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
-               sort_nonatomic(wb->flushing.keys.data,
-                              wb->flushing.keys.nr,
-                              sizeof(wb->flushing.keys.data[0]),
-                              wb_key_seq_cmp, NULL);
-
-               darray_for_each(wb->flushing.keys, i) {
-                       if (!i->journal_seq)
-                               continue;
-
-                       if (!accounting_replay_done &&
-                           i->k.k.type == KEY_TYPE_accounting) {
-                               could_not_insert++;
-                               continue;
-                       }
-
-                       if (!could_not_insert)
-                               bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
-                                                       bch2_btree_write_buffer_journal_flush);
-
-                       bch2_trans_begin(trans);
-
-                       ret = commit_do(trans, NULL, NULL,
-                                       BCH_WATERMARK_reclaim|
-                                       BCH_TRANS_COMMIT_journal_reclaim|
-                                       BCH_TRANS_COMMIT_no_check_rw|
-                                       BCH_TRANS_COMMIT_no_enospc|
-			BCH_TRANS_COMMIT_no_journal_res,
-                                       btree_write_buffered_insert(trans, i));
-                       if (ret)
-                               goto err;
-
-                       i->journal_seq = 0;
-               }
-
-               /*
-                * If journal replay hasn't finished with accounting keys we
-                * can't flush accounting keys at all - condense them and leave
-                * them for next time.
-                *
-                * Q: Can the write buffer overflow?
-		 * A: Shouldn't be any actual risk. It's just new accounting
-                * updates that the write buffer can't flush, and those are only
-                * going to be generated by interior btree node updates as
-                * journal replay has to split/rewrite nodes to make room for
-                * its updates.
-                *
-		 * And for those new accounting updates, updates to the same
-		 * counters get accumulated as they're flushed from the journal
-		 * to the write buffer - see the eytzinger tree accumulation in
-		 * bch2_accounting_key_to_wb(). So we could only overflow if the
-		 * number of distinct counters touched somehow was very large.
-                */
-               if (could_not_insert) {
-                       struct btree_write_buffered_key *dst = wb->flushing.keys.data;
-
-                       darray_for_each(wb->flushing.keys, i)
-                               if (i->journal_seq)
-                                       *dst++ = *i;
-                       wb->flushing.keys.nr = dst - wb->flushing.keys.data;
-               }
-       }
-err:
-       if (ret || !could_not_insert) {
-               bch2_journal_pin_drop(j, &wb->flushing.pin);
-               wb->flushing.keys.nr = 0;
-       }
-
-       bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
-       trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
-       return ret;
-}
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
-       struct journal_keys_to_wb dst;
-       int ret = 0;
-
-       bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
-       for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
-               jset_entry_for_each_key(entry, k) {
-                       ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
-                       if (ret)
-                               goto out;
-               }
-
-               entry->type = BCH_JSET_ENTRY_btree_keys;
-       }
-out:
-       ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
-       return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
-{
-       struct journal *j = &c->journal;
-       struct journal_buf *buf;
-       bool blocked;
-       int ret = 0;
-
-       while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
-               ret = bch2_journal_keys_to_write_buffer(c, buf);
-
-               if (!blocked && !ret) {
-                       spin_lock(&j->lock);
-                       buf->need_flush_to_write_buffer = false;
-                       spin_unlock(&j->lock);
-               }
-
-               mutex_unlock(&j->buf_lock);
-
-               if (blocked) {
-                       bch2_journal_unblock(j);
-                       break;
-               }
-       }
-
-       return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
-                                       bool *did_work)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       int ret = 0, fetch_from_journal_err;
-
-       do {
-               bch2_trans_unlock(trans);
-
-               fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
-
-               *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
-
-               /*
-                * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
-                * is not guaranteed to empty wb->inc:
-                */
-               mutex_lock(&wb->flushing.lock);
-               ret = bch2_btree_write_buffer_flush_locked(trans);
-               mutex_unlock(&wb->flushing.lock);
-       } while (!ret &&
-                (fetch_from_journal_err ||
-                 (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
-                 (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
-
-       return ret;
-}
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
-                               struct journal_entry_pin *_pin, u64 seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       bool did_work = false;
-
-       return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
-}
-
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       bool did_work = false;
-
-       trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
-
-       return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
-}
-
-/*
- * The write buffer requires flushing when going RO: keys in the journal for the
- * write buffer don't have a journal pin yet
- */
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
-{
-       if (bch2_journal_error(&c->journal))
-               return false;
-
-       bool did_work = false;
-       bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
-                               journal_cur_seq(&c->journal), &did_work));
-       return did_work;
-}
-
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       int ret = 0;
-
-       if (mutex_trylock(&wb->flushing.lock)) {
-               ret = bch2_btree_write_buffer_flush_locked(trans);
-               mutex_unlock(&wb->flushing.lock);
-       }
-
-       return ret;
-}
-
-int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
-               return bch_err_throw(c, erofs_no_writes);
-
-       int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-       return ret;
-}
-
-/*
- * In check and repair code, when checking references to write buffer btrees we
- * need to issue a flush before we have a definitive error: this issues a flush
- * if this is a key we haven't yet checked.
- */
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
-                                       struct bkey_s_c referring_k,
-                                       struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_buf tmp;
-       int ret = 0;
-
-       bch2_bkey_buf_init(&tmp);
-
-       if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
-               if (trace_write_buffer_maybe_flush_enabled()) {
-                       struct printbuf buf = PRINTBUF;
-
-                       bch2_bkey_val_to_text(&buf, c, referring_k);
-                       trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-               bch2_bkey_buf_reassemble(&tmp, c, referring_k);
-
-               if (bkey_is_btree_ptr(referring_k.k)) {
-                       bch2_trans_unlock(trans);
-                       bch2_btree_interior_updates_flush(c);
-               }
-
-               ret = bch2_btree_write_buffer_flush_sync(trans);
-               if (ret)
-                       goto err;
-
-               bch2_bkey_buf_copy(last_flushed, c, tmp.k);
-
-               /* can we avoid the unconditional restart? */
-               trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_);
-               ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
-       }
-err:
-       bch2_bkey_buf_exit(&tmp, c);
-       return ret;
-}
-
-static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       int ret;
-
-       mutex_lock(&wb->flushing.lock);
-       do {
-               ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
-       } while (!ret && bch2_btree_write_buffer_should_flush(c));
-       mutex_unlock(&wb->flushing.lock);
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-}
-
-static void wb_accounting_sort(struct btree_write_buffer *wb)
-{
-       eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
-                       sizeof(wb->accounting.data[0]),
-                       wb_key_cmp, NULL);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
-                                      struct bkey_i_accounting *k)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       struct btree_write_buffered_key new = { .btree = btree };
-
-       bkey_copy(&new.k, &k->k_i);
-
-       int ret = darray_push(&wb->accounting, new);
-       if (ret)
-               return ret;
-
-       wb_accounting_sort(wb);
-       return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
-                            struct journal_keys_to_wb *dst,
-                            enum btree_id btree, struct bkey_i *k)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       int ret;
-retry:
-       ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
-       if (!ret && dst->wb == &wb->flushing)
-               ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
-
-       if (unlikely(ret)) {
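-		/* couldn't grow wb->flushing: fall back to queueing onto wb->inc */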
-               if (dst->wb == &c->btree_write_buffer.flushing) {
-                       mutex_unlock(&dst->wb->lock);
-                       dst->wb = &c->btree_write_buffer.inc;
-                       bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
-                                            bch2_btree_write_buffer_journal_flush);
-                       goto retry;
-               }
-
-               return ret;
-       }
-
-       dst->room = darray_room(dst->wb->keys);
-       if (dst->wb == &wb->flushing)
-               dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
-       BUG_ON(!dst->room);
-       BUG_ON(!dst->seq);
-
-       struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-       wb_k->journal_seq       = dst->seq;
-       wb_k->btree             = btree;
-       bkey_copy(&wb_k->k, k);
-       dst->wb->keys.nr++;
-       dst->room--;
-       return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       if (mutex_trylock(&wb->flushing.lock)) {
-               mutex_lock(&wb->inc.lock);
-               move_keys_from_inc_to_flushing(wb);
-
-               /*
-                * Attempt to skip wb->inc, and add keys directly to
-                * wb->flushing, saving us a copy later:
-                */
-
-               if (!wb->inc.keys.nr) {
-                       dst->wb = &wb->flushing;
-               } else {
-                       mutex_unlock(&wb->flushing.lock);
-                       dst->wb = &wb->inc;
-               }
-       } else {
-               mutex_lock(&wb->inc.lock);
-               dst->wb = &wb->inc;
-       }
-
-       dst->room = darray_room(dst->wb->keys);
-       if (dst->wb == &wb->flushing)
-               dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
-       dst->seq = seq;
-
-       bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
-                            bch2_btree_write_buffer_journal_flush);
-
-       darray_for_each(wb->accounting, i)
-               memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
-}
-
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       unsigned live_accounting_keys = 0;
-       int ret = 0;
-
-       darray_for_each(wb->accounting, i)
-               if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
-                       i->journal_seq = dst->seq;
-                       live_accounting_keys++;
-                       ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
-                       if (ret)
-                               break;
-               }
-
-       if (live_accounting_keys * 2 < wb->accounting.nr) {
-               struct btree_write_buffered_key *dst = wb->accounting.data;
-
-               darray_for_each(wb->accounting, src)
-                       if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
-                               *dst++ = *src;
-               wb->accounting.nr = dst - wb->accounting.data;
-               wb_accounting_sort(wb);
-       }
-
-       if (!dst->wb->keys.nr)
-               bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
-
-       if (bch2_btree_write_buffer_should_flush(c) &&
-           __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
-           !queue_work(system_dfl_wq, &c->btree_write_buffer.flush_work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-
-       if (dst->wb == &wb->flushing)
-               mutex_unlock(&wb->flushing.lock);
-       mutex_unlock(&wb->inc.lock);
-
-       return ret;
-}
-
-static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
-{
-       if (wb->keys.size >= new_size)
-               return 0;
-
-       if (!mutex_trylock(&wb->lock))
-               return -EINTR;
-
-       int ret = darray_resize(&wb->keys, new_size);
-       mutex_unlock(&wb->lock);
-       return ret;
-}
-
-int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       return wb_keys_resize(&wb->flushing, new_size) ?:
-               wb_keys_resize(&wb->inc, new_size);
-}
-
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
-              !bch2_journal_error(&c->journal));
-
-       darray_exit(&wb->accounting);
-       darray_exit(&wb->sorted);
-       darray_exit(&wb->flushing.keys);
-       darray_exit(&wb->inc.keys);
-}
-
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       mutex_init(&wb->inc.lock);
-       mutex_init(&wb->flushing.lock);
-       INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
-}
-
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       /* Will be resized by journal as needed: */
-       unsigned initial_size = 1 << 16;
-
-       return  darray_make_room(&wb->inc.keys, initial_size) ?:
-               darray_make_room(&wb->flushing.keys, initial_size) ?:
-               darray_make_room(&wb->sorted, initial_size);
-}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
deleted file mode 100644 (file)
index c351d21..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-
-#include "bkey.h"
-#include "disk_accounting.h"
-
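-/* flush once buffered keys exceed a quarter of the incoming buffer's capacity: */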
-static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
-}
-
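-/* backpressure: updates must wait when the incoming buffer is over 3/4 full: */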
-static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-       return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
-}
-
-struct btree_trans;
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
-int bch2_btree_write_buffer_tryflush(struct btree_trans *);
-
-struct bkey_buf;
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
-
-struct journal_keys_to_wb {
-       struct btree_write_buffer_keys  *wb;
-       size_t                          room;
-       u64                             seq;
-};
-
-static inline int wb_key_cmp(const void *_l, const void *_r)
-{
-       const struct btree_write_buffered_key *l = _l;
-       const struct btree_write_buffered_key *r = _r;
-
-       return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
-                             enum btree_id, struct bkey_i_accounting *);
-
-static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
-                            enum btree_id btree, struct bkey_i_accounting *k)
-{
-       struct btree_write_buffer *wb = &c->btree_write_buffer;
-       struct btree_write_buffered_key search;
-       search.btree = btree;
-       search.k.k.p = k->k.p;
-
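-	/* look for an existing accounting key at this position to accumulate into: */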
-       unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
-                       sizeof(wb->accounting.data[0]),
-                       wb_key_cmp, &search);
-
-       if (idx >= wb->accounting.nr)
-               return bch2_accounting_key_to_wb_slowpath(c, btree, k);
-
-       struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
-       bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
-       return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
-                            struct journal_keys_to_wb *,
-                            enum btree_id, struct bkey_i *);
-
-static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
-                            struct journal_keys_to_wb *dst,
-                            enum btree_id btree, struct bkey_i *k)
-{
-       if (unlikely(!dst->room))
-               return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
-
-       struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-       wb_k->journal_seq       = dst->seq;
-       wb_k->btree             = btree;
-       bkey_copy(&wb_k->k, k);
-       dst->wb->keys.nr++;
-       dst->room--;
-       return 0;
-}
-
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
-                            struct journal_keys_to_wb *dst,
-                            enum btree_id btree, struct bkey_i *k)
-{
-       if (unlikely(!btree_type_uses_write_buffer(btree))) {
-               int ret = bch2_btree_write_buffer_insert_err(c, btree, k);
-               dump_stack();
-               return ret;
-       }
-
-       EBUG_ON(!dst->seq);
-
-       return k->k.type == KEY_TYPE_accounting
-               ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
-               : __bch2_journal_key_to_wb(c, dst, btree, k);
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
-
-int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
-int bch2_fs_btree_write_buffer_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
deleted file mode 100644 (file)
index e9e76e2..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-
-#include "darray.h"
-#include "journal_types.h"
-
-#define BTREE_WRITE_BUFERED_VAL_U64s_MAX       4
-#define BTREE_WRITE_BUFERED_U64s_MAX   (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-
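-/*
- * Reference to a key in the flushing buffer, laid out so that references sort
- * by (btree, pos) when compared as raw u64s:
- */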
-struct wb_key_ref {
-union {
-       struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-               unsigned                        idx:24;
-               u8                              pos[sizeof(struct bpos)];
-               enum btree_id                   btree:8;
-#else
-               enum btree_id                   btree:8;
-               u8                              pos[sizeof(struct bpos)];
-               unsigned                        idx:24;
-#endif
-       } __packed;
-       struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-               u64 lo;
-               u64 mi;
-               u64 hi;
-#else
-               u64 hi;
-               u64 mi;
-               u64 lo;
-#endif
-       };
-};
-};
-
-struct btree_write_buffered_key {
-       enum btree_id                   btree:8;
-       u64                             journal_seq:56;
-       __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-struct btree_write_buffer_keys {
-       DARRAY(struct btree_write_buffered_key) keys;
-       struct journal_entry_pin        pin;
-       struct mutex                    lock;
-};
-
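-/*
- * Two stage pipeline: keys are appended to @inc, then moved in bulk to
- * @flushing (with @sorted holding references to sort) when a flush runs:
- */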
-struct btree_write_buffer {
-       DARRAY(struct wb_key_ref)       sorted;
-       struct btree_write_buffer_keys  inc;
-       struct btree_write_buffer_keys  flushing;
-       struct work_struct              flush_work;
-
-       DARRAY(struct btree_write_buffered_key) accounting;
-};
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
deleted file mode 100644 (file)
index f25903c..0000000
+++ /dev/null
@@ -1,1395 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "inode.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/preempt.h>
-
-void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
-{
-       for (unsigned i = 0; i < BCH_DATA_NR; i++)
-               usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
-{
-       memset(usage, 0, sizeof(*usage));
-       acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
-                       sizeof(struct bch_dev_usage_full) / sizeof(u64));
-}
-
-static u64 reserve_factor(u64 r)
-{
-       return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
-}
-
-static struct bch_fs_usage_short
-__bch2_fs_usage_read_short(struct bch_fs *c)
-{
-       struct bch_fs_usage_short ret;
-       u64 data, reserved;
-
-       ret.capacity = c->capacity -
-               percpu_u64_get(&c->usage->hidden);
-
-       data            = percpu_u64_get(&c->usage->data) +
-               percpu_u64_get(&c->usage->btree);
-       reserved        = percpu_u64_get(&c->usage->reserved) +
-               percpu_u64_get(c->online_reserved);
-
-       ret.used        = min(ret.capacity, data + reserve_factor(reserved));
-       ret.free        = ret.capacity - ret.used;
-
-       ret.nr_inodes   = percpu_u64_get(&c->usage->nr_inodes);
-
-       return ret;
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *c)
-{
-       struct bch_fs_usage_short ret;
-
-       percpu_down_read(&c->mark_lock);
-       ret = __bch2_fs_usage_read_short(c);
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *out,
-                           struct bch_dev *ca,
-                           struct bch_dev_usage_full *usage)
-{
-       if (out->nr_tabstops < 5) {
-               printbuf_tabstops_reset(out);
-               printbuf_tabstop_push(out, 12);
-               printbuf_tabstop_push(out, 16);
-               printbuf_tabstop_push(out, 16);
-               printbuf_tabstop_push(out, 16);
-               printbuf_tabstop_push(out, 16);
-       }
-
-       prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
-
-       for (unsigned i = 0; i < BCH_DATA_NR; i++) {
-               bch2_prt_data_type(out, i);
-               prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
-                          usage->d[i].buckets,
-                          usage->d[i].sectors,
-                          usage->d[i].fragmented);
-       }
-
-       prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
-}
-
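-/*
- * Check a single extent pointer against the in-memory (GC) bucket it points
- * into: repair bucket state where that's safe, otherwise set *do_update so
- * the caller rewrites the key:
- */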
-static int bch2_check_fix_ptr(struct btree_trans *trans,
-                             struct bkey_s_c k,
-                             struct extent_ptr_decoded p,
-                             const union bch_extent_entry *entry,
-                             bool *do_update)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
-       if (!ca) {
-               if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
-                               trans, ptr_to_invalid_device,
-                               "pointer to missing device %u\n"
-                               "while marking %s",
-                               p.ptr.dev,
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       *do_update = true;
-               return 0;
-       }
-
-       struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-       if (!g) {
-               if (fsck_err(trans, ptr_to_invalid_device,
-                            "pointer to invalid bucket on device %u\n"
-                            "while marking %s",
-                            p.ptr.dev,
-                            (printbuf_reset(&buf),
-                             bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       *do_update = true;
-               goto out;
-       }
-
-       enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-
-       if (fsck_err_on(!g->gen_valid,
-                       trans, ptr_to_missing_alloc_key,
-                       "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
-                       "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                       bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-                       p.ptr.gen,
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               if (!p.ptr.cached) {
-                       g->gen_valid            = true;
-                       g->gen                  = p.ptr.gen;
-               } else {
-                       /* this pointer will be dropped */
-                       *do_update = true;
-                       goto out;
-               }
-       }
-
-       /* g->gen_valid == true */
-
-       if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
-                       trans, ptr_gen_newer_than_bucket_gen,
-                       "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
-                       "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                       bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-                       p.ptr.gen, g->gen,
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               if (!p.ptr.cached &&
-                   (g->data_type != BCH_DATA_btree ||
-                    data_type == BCH_DATA_btree)) {
-                       g->data_type            = data_type;
-                       g->stripe_sectors       = 0;
-                       g->dirty_sectors        = 0;
-                       g->cached_sectors       = 0;
-               }
-
-               *do_update = true;
-       }
-
-       if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
-                       trans, ptr_gen_newer_than_bucket_gen,
-                       "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
-                       "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-                       bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-                       p.ptr.gen,
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-               *do_update = true;
-
-       if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
-                       trans, stale_dirty_ptr,
-                       "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
-                       "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                       bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
-                       p.ptr.gen, g->gen,
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-               *do_update = true;
-
-       if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
-               goto out;
-
-       if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
-                       trans, ptr_bucket_data_type_mismatch,
-                       "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
-                       "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-                       bch2_data_type_str(g->data_type),
-                       bch2_data_type_str(data_type),
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               if (!p.ptr.cached &&
-                   data_type == BCH_DATA_btree) {
-                       switch (g->data_type) {
-                       case BCH_DATA_sb:
-                               bch_err(c, "btree and superblock in the same bucket - cannot repair");
-                               ret = bch_err_throw(c, fsck_repair_unimplemented);
-                               goto out;
-                       case BCH_DATA_journal:
-                               ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
-                               bch_err_msg(c, ret, "error deleting journal bucket %zu",
-                                           PTR_BUCKET_NR(ca, &p.ptr));
-                               if (ret)
-                                       goto out;
-                               break;
-                       }
-
-                       g->data_type            = data_type;
-                       g->stripe_sectors       = 0;
-                       g->dirty_sectors        = 0;
-                       g->cached_sectors       = 0;
-               } else {
-                       *do_update = true;
-               }
-       }
-
-       if (p.has_ec) {
-               struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
-               if (fsck_err_on(!m || !m->alive,
-                               trans, ptr_to_missing_stripe,
-                               "pointer to nonexistent stripe %llu\n"
-                               "while marking %s",
-                               (u64) p.ec.idx,
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       *do_update = true;
-
-               if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
-                               trans, ptr_to_incorrect_stripe,
-                               "pointer does not match stripe %llu\n"
-                               "while marking %s",
-                               (u64) p.ec.idx,
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-                       *do_update = true;
-       }
-out:
-fsck_err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_fix_ptrs(struct btree_trans *trans,
-                       enum btree_id btree, unsigned level, struct bkey_s_c k,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry_c;
-       struct extent_ptr_decoded p = { 0 };
-       bool do_update = false;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       /* We don't yet do btree key updates correctly for when we're RW */
-       BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
-       bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
-               ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
-               if (ret)
-                       goto err;
-       }
-
-       if (do_update) {
-               struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto err;
-
-               scoped_guard(rcu)
-                       bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
-
-               if (level) {
-                       /*
-                        * We don't want to drop btree node pointers - if the
-                        * btree node isn't there anymore, the read path will
-                        * sort it out:
-                        */
-                       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-                       scoped_guard(rcu)
-                               bkey_for_each_ptr(ptrs, ptr) {
-                                       struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-                                       ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
-                               }
-               } else {
-                       struct bkey_ptrs ptrs;
-                       union bch_extent_entry *entry;
-
-                       rcu_read_lock();
-restart_drop_ptrs:
-                       ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-                       bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
-                               struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
-                               struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-                               enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
-                               if ((p.ptr.cached &&
-                                    (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
-                                   (!p.ptr.cached &&
-                                    gen_cmp(p.ptr.gen, g->gen) < 0) ||
-                                   gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
-                                   (g->data_type &&
-                                    g->data_type != data_type)) {
-                                       bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
-                                       goto restart_drop_ptrs;
-                               }
-                       }
-                       rcu_read_unlock();
-again:
-                       ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-                       bkey_extent_entry_for_each(ptrs, entry) {
-                               if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
-                                       struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
-                                                                       entry->stripe_ptr.idx);
-                                       union bch_extent_entry *next_ptr;
-
-                                       bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
-                                               if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
-                                                       goto found;
-                                       next_ptr = NULL;
-found:
-                                       if (!next_ptr) {
-                                               bch_err(c, "aieee, found stripe ptr with no data ptr");
-                                               continue;
-                                       }
-
-                                       if (!m || !m->alive ||
-                                           !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
-                                                                      &next_ptr->ptr,
-                                                                      m->sectors)) {
-                                               bch2_bkey_extent_entry_drop(new, entry);
-                                               goto again;
-                                       }
-                               }
-                       }
-               }
-
-               if (0) {
-                       printbuf_reset(&buf);
-                       bch2_bkey_val_to_text(&buf, c, k);
-                       bch_info(c, "updated %s", buf.buf);
-
-                       printbuf_reset(&buf);
-                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
-                       bch_info(c, "new key %s", buf.buf);
-               }
-
-               if (!(flags & BTREE_TRIGGER_is_root)) {
-                       struct btree_iter iter;
-                       bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
-                                                 BTREE_ITER_intent|BTREE_ITER_all_snapshots);
-                       ret =   bch2_btree_iter_traverse(trans, &iter) ?:
-                               bch2_trans_update(trans, &iter, new,
-                                                 BTREE_UPDATE_internal_snapshot_node|
-                                                 BTREE_TRIGGER_norun);
-                       bch2_trans_iter_exit(trans, &iter);
-                       if (ret)
-                               goto err;
-
-                       if (level)
-                               bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
-               } else {
-                       struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
-                                              jset_u64s(new->k.u64s));
-                       ret = PTR_ERR_OR_ZERO(e);
-                       if (ret)
-                               goto err;
-
-                       journal_entry_set(e,
-                                         BCH_JSET_ENTRY_btree_root,
-                                         btree, level - 1,
-                                         new, new->k.u64s);
-
-                       /*
-                        * no locking, we're single threaded and not rw yet, see
-                        * the big assertion above that we repeat here:
-                        */
-                       BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
-                       struct btree *b = bch2_btree_id_root(c, btree)->b;
-                       bkey_copy(&b->key, new);
-               }
-       }
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
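-/*
- * Common error path for bucket reference updates: log the key being marked,
- * schedule the check_allocations recovery pass, and fail hard only on
- * inserts - overwrite errors are always swallowed so that deletions can make
- * progress:
- */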
-static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
-                                struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
-{
-       struct bch_fs *c = trans->c;
-
-       prt_printf(buf, "\nwhile marking ");
-       bch2_bkey_val_to_text(buf, c, k);
-       prt_newline(buf);
-
-       bool print = __bch2_count_fsck_err(c, id, buf);
-
-       int ret = bch2_run_explicit_recovery_pass(c, buf,
-                                       BCH_RECOVERY_PASS_check_allocations, 0);
-
-       if (insert) {
-               bch2_trans_updates_to_text(buf, trans);
-               __bch2_inconsistent_error(c, buf);
-               /*
-                * If we're in recovery, run_explicit_recovery_pass might give
-                * us an error code for rewinding recovery
-                */
-               if (!ret)
-                       ret = bch_err_throw(c, bucket_ref_update);
-       } else {
-               /* Always ignore overwrite errors, so that deletion works */
-               ret = 0;
-       }
-
-       if (print || insert)
-               bch2_print_str(c, KERN_ERR, buf->buf);
-       return ret;
-}
-
-int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
-                          struct bkey_s_c k,
-                          const struct bch_extent_ptr *ptr,
-                          s64 sectors, enum bch_data_type ptr_data_type,
-                          u8 b_gen, u8 bucket_data_type,
-                          u32 *bucket_sectors)
-{
-       struct bch_fs *c = trans->c;
-       size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
-       struct printbuf buf = PRINTBUF;
-       bool inserting = sectors > 0;
-       int ret = 0;
-
-       BUG_ON(!sectors);
-
-       if (unlikely(gen_after(ptr->gen, b_gen))) {
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf,
-                       "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen",
-                       ptr->dev, bucket_nr, b_gen,
-                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-                       ptr->gen);
-
-               ret = bucket_ref_update_err(trans, &buf, k, inserting,
-                                           BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen);
-               goto out;
-       }
-
-       if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) {
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf,
-                       "bucket %u:%zu gen %u data type %s: ptr gen %u too stale",
-                       ptr->dev, bucket_nr, b_gen,
-                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-                       ptr->gen);
-
-               ret = bucket_ref_update_err(trans, &buf, k, inserting,
-                                           BCH_FSCK_ERR_ptr_too_stale);
-               goto out;
-       }
-
-       if (b_gen != ptr->gen && ptr->cached) {
-               ret = 1;
-               goto out;
-       }
-
-       if (unlikely(b_gen != ptr->gen)) {
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf,
-                       "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)",
-                       ptr->dev, bucket_nr, b_gen,
-                       bucket_gen_get(ca, bucket_nr),
-                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-                       ptr->gen);
-
-               ret = bucket_ref_update_err(trans, &buf, k, inserting,
-                                           BCH_FSCK_ERR_stale_dirty_ptr);
-               goto out;
-       }
-
-       if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) {
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
-                          ptr->dev, bucket_nr, b_gen,
-                          bch2_data_type_str(bucket_data_type),
-                          bch2_data_type_str(ptr_data_type));
-
-               ret = bucket_ref_update_err(trans, &buf, k, inserting,
-                                           BCH_FSCK_ERR_ptr_bucket_data_type_mismatch);
-               goto out;
-       }
-
-       if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) {
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf,
-                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX",
-                       ptr->dev, bucket_nr, b_gen,
-                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
-                       *bucket_sectors, sectors);
-
-               ret = bucket_ref_update_err(trans, &buf, k, inserting,
-                                           BCH_FSCK_ERR_bucket_sector_count_overflow);
-               sectors = -*bucket_sectors;
-               goto out;
-       }
-
-       *bucket_sectors += sectors;
-out:
-       printbuf_exit(&buf);
-       return ret;
-}
-
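-/*
- * Fold the transaction's accumulated usage delta into the filesystem usage
- * counters; anything that grew beyond the transaction's disk reservation is
- * clamped and reported (once) as an inconsistency:
- */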
-void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-       static int warned_disk_usage = 0;
-       bool warn = false;
-
-       percpu_down_read(&c->mark_lock);
-       struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-
-       s64 added = src->btree + src->data + src->reserved;
-
-       /*
-        * Not allowed to reduce sectors_available except by getting a
-        * reservation:
-        */
-       s64 should_not_have_added = added - (s64) disk_res_sectors;
-       if (unlikely(should_not_have_added > 0)) {
-               u64 old, new;
-
-               old = atomic64_read(&c->sectors_available);
-               do {
-                       new = max_t(s64, 0, old - should_not_have_added);
-               } while (!atomic64_try_cmpxchg(&c->sectors_available,
-                                              &old, new));
-
-               added -= should_not_have_added;
-               warn = true;
-       }
-
-       if (added > 0) {
-               trans->disk_res->sectors -= added;
-               this_cpu_sub(*c->online_reserved, added);
-       }
-
-       preempt_disable();
-       struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
-       acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
-       preempt_enable();
-       percpu_up_read(&c->mark_lock);
-
-       if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
-               bch2_trans_inconsistent(trans,
-                                       "disk usage increased %lli more than %llu sectors reserved",
-                                       should_not_have_added, disk_res_sectors);
-}
-
-/* KEY_TYPE_extent: */
-
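-/*
- * Route a sector delta to the right per-bucket counter - stripe (erasure
- * coded), dirty or cached - and update the bucket's data type on insert:
- */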
-static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
-                         struct bkey_s_c k,
-                         const struct extent_ptr_decoded *p,
-                         s64 sectors, enum bch_data_type ptr_data_type,
-                         struct bch_alloc_v4 *a,
-                         bool insert)
-{
-       u32 *dst_sectors = p->has_ec    ? &a->stripe_sectors :
-               !p->ptr.cached          ? &a->dirty_sectors :
-                                         &a->cached_sectors;
-       int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
-                                        a->gen, a->data_type, dst_sectors);
-
-       if (ret)
-               return ret;
-       if (insert)
-               alloc_data_type_set(a, ptr_data_type);
-       return 0;
-}
-
-static int bch2_trigger_pointer(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level,
-                       struct bkey_s_c k, struct extent_ptr_decoded p,
-                       const union bch_extent_entry *entry,
-                       s64 *sectors,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       bool insert = !(flags & BTREE_TRIGGER_overwrite);
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bkey_i_backpointer bp;
-       bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
-
-       *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
-
-       struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
-       if (unlikely(!ca)) {
-               if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
-                       ret = bch_err_throw(c, trigger_pointer);
-               goto err;
-       }
-
-       struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
-       if (!bucket_valid(ca, bucket.offset)) {
-               if (insert) {
-                       bch2_dev_bucket_missing(ca, bucket.offset);
-                       ret = bch_err_throw(c, trigger_pointer);
-               }
-               goto err;
-       }
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
-               ret = PTR_ERR_OR_ZERO(a) ?:
-                       __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
-               if (ret)
-                       goto err;
-
-               ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
-               if (ret)
-                       goto err;
-       }
-
-       if (flags & BTREE_TRIGGER_gc) {
-               struct bucket *g = gc_bucket(ca, bucket.offset);
-               if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
-                                           p.ptr.dev,
-                                           (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                       ret = bch_err_throw(c, trigger_pointer);
-                       goto err;
-               }
-
-               bucket_lock(g);
-               struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
-               ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
-               alloc_to_bucket(g, new);
-               bucket_unlock(g);
-
-               if (!ret)
-                       ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
-       }
-err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
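-/*
- * An extent pointer that points into an erasure coded stripe charges its
- * sectors to the stripe's per-block counts and to the stripe's replicas
- * entry, in either the transactional or the GC view of the accounting:
- */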
-static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
-                               struct bkey_s_c k,
-                               struct extent_ptr_decoded p,
-                               enum bch_data_type data_type,
-                               s64 sectors,
-                               enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               struct btree_iter iter;
-               struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
-                               BTREE_ID_stripes, POS(0, p.ec.idx),
-                               BTREE_ITER_with_updates, stripe);
-               int ret = PTR_ERR_OR_ZERO(s);
-               if (unlikely(ret)) {
-                       bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
-                               "pointer to nonexistent stripe %llu",
-                               (u64) p.ec.idx);
-                       goto err;
-               }
-
-               if (!bch2_ptr_matches_stripe(&s->v, p)) {
-                       bch2_trans_inconsistent(trans,
-                               "stripe pointer doesn't match stripe %llu",
-                               (u64) p.ec.idx);
-                       ret = bch_err_throw(c, trigger_stripe_pointer);
-                       goto err;
-               }
-
-               stripe_blockcount_set(&s->v, p.ec.block,
-                       stripe_blockcount_get(&s->v, p.ec.block) +
-                       sectors);
-
-               struct disk_accounting_pos acc;
-               memset(&acc, 0, sizeof(acc));
-               acc.type = BCH_DISK_ACCOUNTING_replicas;
-               bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
-               acc.replicas.data_type = data_type;
-               ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-err:
-               bch2_trans_iter_exit(trans, &iter);
-               return ret;
-       }
-
-       if (flags & BTREE_TRIGGER_gc) {
-               struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
-               if (!m) {
-                       bch_err(c, "error allocating memory for gc_stripes, idx %llu",
-                               (u64) p.ec.idx);
-                       return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
-               }
-
-               gc_stripe_lock(m);
-
-               if (!m->alive) {
-                       gc_stripe_unlock(m);
-                       struct printbuf buf = PRINTBUF;
-                       bch2_log_msg_start(c, &buf);
-                       prt_printf(&buf, "pointer to nonexistent stripe %llu\n  while marking ",
-                                  (u64) p.ec.idx);
-                       bch2_bkey_val_to_text(&buf, c, k);
-                       __bch2_inconsistent_error(c, &buf);
-                       bch2_print_str(c, KERN_ERR, buf.buf);
-                       printbuf_exit(&buf);
-                       return bch_err_throw(c, trigger_stripe_pointer);
-               }
-
-               m->block_sectors[p.ec.block] += sectors;
-
-               struct disk_accounting_pos acc;
-               memset(&acc, 0, sizeof(acc));
-               acc.type = BCH_DISK_ACCOUNTING_replicas;
-               unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
-               gc_stripe_unlock(m);
-
-               acc.replicas.data_type = data_type;
-               int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
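-/*
- * Walk every pointer in an extent, updating bucket and backpointer state via
- * bch2_trigger_pointer(), then roll the results up into the replicas,
- * compression, snapshot and btree/inum accounting counters:
- */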
-static int __trigger_extent(struct btree_trans *trans,
-                           enum btree_id btree_id, unsigned level,
-                           struct bkey_s_c k,
-                           enum btree_iter_update_trigger_flags flags)
-{
-       bool gc = flags & BTREE_TRIGGER_gc;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
-               ? BCH_DATA_btree
-               : BCH_DATA_user;
-       int ret = 0;
-
-       s64 replicas_sectors = 0;
-
-       struct disk_accounting_pos acc_replicas_key;
-       memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
-       acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
-       acc_replicas_key.replicas.data_type     = data_type;
-       acc_replicas_key.replicas.nr_devs       = 0;
-       acc_replicas_key.replicas.nr_required   = 1;
-
-       unsigned cur_compression_type = 0;
-       u64 compression_acct[3] = { 1, 0, 0 };
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               s64 disk_sectors = 0;
-               ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
-               if (ret < 0)
-                       return ret;
-
-               bool stale = ret > 0;
-
-               if (p.ptr.cached && stale)
-                       continue;
-
-               if (p.ptr.cached) {
-                       ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
-                       if (ret)
-                               return ret;
-               } else if (!p.has_ec) {
-                       replicas_sectors       += disk_sectors;
-                       replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
-               } else {
-                       ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
-                       if (ret)
-                               return ret;
-
-                       /*
-                        * There may be other dirty pointers in this extent,
-                        * but they're not required for mounting as long as we
-                        * have an erasure coded pointer in this extent:
-                        */
-                       acc_replicas_key.replicas.nr_required = 0;
-               }
-
-               if (cur_compression_type &&
-                   cur_compression_type != p.crc.compression_type) {
-                       if (flags & BTREE_TRIGGER_overwrite)
-                               bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
-                       ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
-                                                       compression, cur_compression_type);
-                       if (ret)
-                               return ret;
-
-                       compression_acct[0] = 1;
-                       compression_acct[1] = 0;
-                       compression_acct[2] = 0;
-               }
-
-               cur_compression_type = p.crc.compression_type;
-               if (p.crc.compression_type) {
-                       compression_acct[1] += p.crc.uncompressed_size;
-                       compression_acct[2] += p.crc.compressed_size;
-               }
-       }
-
-       if (acc_replicas_key.replicas.nr_devs) {
-               ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
-               if (ret)
-                       return ret;
-       }
-
-       if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
-               ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
-               if (ret)
-                       return ret;
-       }
-
-       if (cur_compression_type) {
-               if (flags & BTREE_TRIGGER_overwrite)
-                       bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
-               ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
-                                               compression, cur_compression_type);
-               if (ret)
-                       return ret;
-       }
-
-       if (level) {
-               ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
-               if (ret)
-                       return ret;
-       } else {
-               bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
-               s64 v[3] = {
-                       insert ? 1 : -1,
-                       insert ? k.k->size : -((s64) k.k->size),
-                       replicas_sectors,
-               };
-               ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-int bch2_trigger_extent(struct btree_trans *trans,
-                       enum btree_id btree, unsigned level,
-                       struct bkey_s_c old, struct bkey_s new,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
-       struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
-       unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
-       unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
-
-       if (unlikely(flags & BTREE_TRIGGER_check_repair))
-               return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
-
-       /* if pointers aren't changing - nothing to do: */
-       if (new_ptrs_bytes == old_ptrs_bytes &&
-           !memcmp(new_ptrs.start,
-                   old_ptrs.start,
-                   new_ptrs_bytes))
-               return 0;
-
-       if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
-               if (old.k->type) {
-                       int ret = __trigger_extent(trans, btree, level, old,
-                                                  flags & ~BTREE_TRIGGER_insert);
-                       if (ret)
-                               return ret;
-               }
-
-               if (new.k->type) {
-                       int ret = __trigger_extent(trans, btree, level, new.s_c,
-                                                  flags & ~BTREE_TRIGGER_overwrite);
-                       if (ret)
-                               return ret;
-               }
-
-               int need_rebalance_delta = 0;
-               s64 need_rebalance_sectors_delta[1] = { 0 };
-
-               s64 s = bch2_bkey_sectors_need_rebalance(c, old);
-               need_rebalance_delta -= s != 0;
-               need_rebalance_sectors_delta[0] -= s;
-
-               s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
-               need_rebalance_delta += s != 0;
-               need_rebalance_sectors_delta[0] += s;
-
-               if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
-                       int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-                                                         new.k->p, need_rebalance_delta > 0);
-                       if (ret)
-                               return ret;
-               }
-
-               if (need_rebalance_sectors_delta[0]) {
-                       int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
-                                                           need_rebalance_sectors_delta, rebalance_work);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       return 0;
-}
-
-/* KEY_TYPE_reservation */
-
-static int __trigger_reservation(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level, struct bkey_s_c k,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
-               s64 sectors[1] = { k.k->size };
-
-               if (flags & BTREE_TRIGGER_overwrite)
-                       sectors[0] = -sectors[0];
-
-               return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
-                               persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
-       }
-
-       return 0;
-}
-
-int bch2_trigger_reservation(struct btree_trans *trans,
-                         enum btree_id btree_id, unsigned level,
-                         struct bkey_s_c old, struct bkey_s new,
-                         enum btree_iter_update_trigger_flags flags)
-{
-       return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
-}
-
-/* Mark superblocks: */
-
-static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
-                                   struct bch_dev *ca, u64 b,
-                                   enum bch_data_type type,
-                                   unsigned sectors)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       int ret = 0;
-
-       struct bkey_i_alloc_v4 *a =
-               bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
-       if (IS_ERR(a))
-               return PTR_ERR(a);
-
-       if (a->v.data_type && type && a->v.data_type != type) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
-                          "while marking %s\n",
-                          iter.pos.inode, iter.pos.offset, a->v.gen,
-                          bch2_data_type_str(a->v.data_type),
-                          bch2_data_type_str(type),
-                          bch2_data_type_str(type));
-
-               bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
-
-               ret = bch2_run_explicit_recovery_pass(c, &buf,
-                                       BCH_RECOVERY_PASS_check_allocations, 0);
-
-               /* Always print, this is always fatal */
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-               if (!ret)
-                       ret = bch_err_throw(c, metadata_bucket_inconsistency);
-               goto err;
-       }
-
-       if (a->v.data_type      != type ||
-           a->v.dirty_sectors  != sectors) {
-               a->v.data_type          = type;
-               a->v.dirty_sectors      = sectors;
-               ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
-                       u64 b, enum bch_data_type data_type, unsigned sectors,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       struct bucket *g = gc_bucket(ca, b);
-       if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
-                                   ca->dev_idx, bch2_data_type_str(data_type)))
-               goto err;
-
-       bucket_lock(g);
-       struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
-
-       if (bch2_fs_inconsistent_on(g->data_type &&
-                       g->data_type != data_type, c,
-                       "different types of data in same bucket: %s, %s",
-                       bch2_data_type_str(g->data_type),
-                       bch2_data_type_str(data_type)))
-               goto err_unlock;
-
-       if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
-                       "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
-                       ca->dev_idx, b, g->gen,
-                       bch2_data_type_str(g->data_type ?: data_type),
-                       g->dirty_sectors, sectors))
-               goto err_unlock;
-
-       g->data_type = data_type;
-       g->dirty_sectors += sectors;
-       struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
-       bucket_unlock(g);
-       ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
-       return ret;
-err_unlock:
-       bucket_unlock(g);
-err:
-       return bch_err_throw(c, metadata_bucket_inconsistency);
-}
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
-                       struct bch_dev *ca, u64 b,
-                       enum bch_data_type type, unsigned sectors,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       BUG_ON(type != BCH_DATA_free &&
-              type != BCH_DATA_sb &&
-              type != BCH_DATA_journal);
-
-       /*
-        * Backup superblock might be past the end of our normal usable space:
-        */
-       if (b >= ca->mi.nbuckets)
-               return 0;
-
-       if (flags & BTREE_TRIGGER_gc)
-               return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
-       else if (flags & BTREE_TRIGGER_transactional)
-               return commit_do(trans, NULL, NULL, 0,
-                                __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
-       else
-               BUG();
-}
-
-static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
-                       struct bch_dev *ca, u64 start, u64 end,
-                       enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       do {
-               u64 b = sector_to_bucket(ca, start);
-               unsigned sectors =
-                       min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
-               if (b != *bucket && *bucket_sectors) {
-                       int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
-                                                       type, *bucket_sectors, flags);
-                       if (ret)
-                               return ret;
-
-                       *bucket_sectors = 0;
-               }
-
-               *bucket         = b;
-               *bucket_sectors += sectors;
-               start += sectors;
-       } while (start < end);
-
-       return 0;
-}
-
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_layout layout = ca->disk_sb.sb->layout;
-       mutex_unlock(&c->sb_lock);
-
-       u64 bucket = 0;
-       unsigned i, bucket_sectors = 0;
-       int ret;
-
-       for (i = 0; i < layout.nr_superblocks; i++) {
-               u64 offset = le64_to_cpu(layout.sb_offset[i]);
-
-               if (offset == BCH_SB_SECTOR) {
-                       ret = bch2_trans_mark_metadata_sectors(trans, ca,
-                                               0, BCH_SB_SECTOR,
-                                               BCH_DATA_sb, &bucket, &bucket_sectors, flags);
-                       if (ret)
-                               return ret;
-               }
-
-               ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
-                                     offset + (1 << layout.sb_max_size_bits),
-                                     BCH_DATA_sb, &bucket, &bucket_sectors, flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (bucket_sectors) {
-               ret = bch2_trans_mark_metadata_bucket(trans, ca,
-                               bucket, BCH_DATA_sb, bucket_sectors, flags);
-               if (ret)
-                       return ret;
-       }
-
-       for (i = 0; i < ca->journal.nr; i++) {
-               ret = bch2_trans_mark_metadata_bucket(trans, ca,
-                               ca->journal.buckets[i],
-                               BCH_DATA_journal, ca->mi.bucket_size, flags);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       int ret = bch2_trans_run(c,
-               __bch2_trans_mark_dev_sb(trans, ca, flags));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
-               int ret = bch2_trans_mark_dev_sb(c, ca, flags);
-               if (ret) {
-                       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
-{
-       return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
-}
-
-bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
-       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
-       u64 b_offset    = bucket_to_sector(ca, b);
-       u64 b_end       = bucket_to_sector(ca, b + 1);
-       unsigned i;
-
-       if (!b)
-               return true;
-
-       for (i = 0; i < layout->nr_superblocks; i++) {
-               u64 offset = le64_to_cpu(layout->sb_offset[i]);
-               u64 end = offset + (1 << layout->sb_max_size_bits);
-
-               if (!(offset >= b_end || end <= b_offset))
-                       return true;
-       }
-
-       for (i = 0; i < ca->journal.nr; i++)
-               if (b == ca->journal.buckets[i])
-                       return true;
-
-       return false;
-}
-
-/* Disk reservations: */
-
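-/*
- * Reservations are satisfied from a per-cpu cache of sectors when possible;
- * each refill takes the requested amount plus SECTORS_CACHE from the global
- * atomic counter, so that counter is only touched once per ~1024 sectors.
- * When the fast path fails, we recalculate free space under a lock:
- */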
-#define SECTORS_CACHE  1024
-
-int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
-                               u64 sectors, enum bch_reservation_flags flags)
-{
-       struct bch_fs_pcpu *pcpu;
-       u64 old, get;
-       u64 sectors_available;
-       int ret;
-
-       percpu_down_read(&c->mark_lock);
-       preempt_disable();
-       pcpu = this_cpu_ptr(c->pcpu);
-
-       if (sectors <= pcpu->sectors_available)
-               goto out;
-
-       old = atomic64_read(&c->sectors_available);
-       do {
-               get = min((u64) sectors + SECTORS_CACHE, old);
-
-               if (get < sectors) {
-                       preempt_enable();
-                       goto recalculate;
-               }
-       } while (!atomic64_try_cmpxchg(&c->sectors_available,
-                                      &old, old - get));
-
-       pcpu->sectors_available         += get;
-
-out:
-       pcpu->sectors_available         -= sectors;
-       this_cpu_add(*c->online_reserved, sectors);
-       res->sectors                    += sectors;
-
-       preempt_enable();
-       percpu_up_read(&c->mark_lock);
-       return 0;
-
-recalculate:
-       mutex_lock(&c->sectors_available_lock);
-
-       percpu_u64_set(&c->pcpu->sectors_available, 0);
-       sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
-
-       if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
-               sectors = min(sectors, sectors_available);
-
-       if (sectors <= sectors_available ||
-           (flags & BCH_DISK_RESERVATION_NOFAIL)) {
-               atomic64_set(&c->sectors_available,
-                            max_t(s64, 0, sectors_available - sectors));
-               this_cpu_add(*c->online_reserved, sectors);
-               res->sectors                    += sectors;
-               ret = 0;
-       } else {
-               atomic64_set(&c->sectors_available, sectors_available);
-               ret = bch_err_throw(c, ENOSPC_disk_reservation);
-       }
-
-       mutex_unlock(&c->sectors_available_lock);
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
-/* Startup/shutdown: */
-
-void bch2_buckets_nouse_free(struct bch_fs *c)
-{
-       for_each_member_device(c, ca) {
-               kvfree_rcu_mightsleep(ca->buckets_nouse);
-               ca->buckets_nouse = NULL;
-       }
-}
-
-int bch2_buckets_nouse_alloc(struct bch_fs *c)
-{
-       for_each_member_device(c, ca) {
-               BUG_ON(ca->buckets_nouse);
-
-               ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
-                                           sizeof(unsigned long),
-                                           GFP_KERNEL|__GFP_ZERO);
-               if (!ca->buckets_nouse) {
-                       bch2_dev_put(ca);
-                       return bch_err_throw(c, ENOMEM_buckets_nouse);
-               }
-       }
-
-       return 0;
-}
-
-static void bucket_gens_free_rcu(struct rcu_head *rcu)
-{
-       struct bucket_gens *buckets =
-               container_of(rcu, struct bucket_gens, rcu);
-
-       kvfree(buckets);
-}
-
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
-       struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
-       bool resize = ca->bucket_gens != NULL;
-       int ret;
-
-       if (resize)
-               lockdep_assert_held(&c->state_lock);
-
-       if (resize && ca->buckets_nouse)
-               return bch_err_throw(c, no_resize_with_buckets_nouse);
-
-       bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
-                                   GFP_KERNEL|__GFP_ZERO);
-       if (!bucket_gens) {
-               ret = bch_err_throw(c, ENOMEM_bucket_gens);
-               goto err;
-       }
-
-       bucket_gens->first_bucket = ca->mi.first_bucket;
-       bucket_gens->nbuckets   = nbuckets;
-       bucket_gens->nbuckets_minus_first =
-               bucket_gens->nbuckets - bucket_gens->first_bucket;
-
-       old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
-
-       if (resize) {
-               u64 copy = min(bucket_gens->nbuckets,
-                              old_bucket_gens->nbuckets);
-               memcpy(bucket_gens->b,
-                      old_bucket_gens->b,
-                      sizeof(bucket_gens->b[0]) * copy);
-       }
-
-       ret =   bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
-                                         ca->mi.nbuckets, nbuckets) ?:
-               bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
-                                         ca->mi.nbuckets, nbuckets);
-
-       rcu_assign_pointer(ca->bucket_gens, bucket_gens);
-       bucket_gens     = old_bucket_gens;
-
-       nbuckets = ca->mi.nbuckets;
-
-       ret = 0;
-err:
-       if (bucket_gens)
-               call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-
-       return ret;
-}
-
-void bch2_dev_buckets_free(struct bch_dev *ca)
-{
-       kvfree(ca->buckets_nouse);
-       kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
-       free_percpu(ca->usage);
-}
-
-int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
-       ca->usage = alloc_percpu(struct bch_dev_usage_full);
-       if (!ca->usage)
-               return bch_err_throw(c, ENOMEM_usage_init);
-
-       return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
-}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
deleted file mode 100644 (file)
index 49a3807..0000000
+++ /dev/null
@@ -1,369 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
-       return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
-       return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
-       u32 remainder;
-
-       div_u64_rem(s, ca->mi.bucket_size, &remainder);
-       return remainder;
-}
-
-static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
-{
-       return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-#define for_each_bucket(_b, _buckets)                          \
-       for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
-            _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-
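-/*
- * Per-bucket locks are a single bit in struct bucket, taken and released
- * with the wait_on_bit machinery:
- */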
-static inline void bucket_unlock(struct bucket *b)
-{
-       BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
-       clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
-       smp_mb__after_atomic();
-       wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void bucket_lock(struct bucket *b)
-{
-       wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
-                        TASK_UNINTERRUPTIBLE);
-}
-
-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
-{
-       return bucket_valid(ca, b)
-               ? genradix_ptr(&ca->buckets_gc, b)
-               : NULL;
-}
-
-static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
-{
-       return rcu_dereference_check(ca->bucket_gens,
-                                    lockdep_is_held(&ca->fs->state_lock));
-}
-
-static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
-{
-       struct bucket_gens *gens = bucket_gens(ca);
-
-       if (b - gens->first_bucket >= gens->nbuckets_minus_first)
-               return NULL;
-       return gens->b + b;
-}
-
-static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
-{
-       u8 *gen = bucket_gen(ca, b);
-       return gen ? *gen : -1;
-}
-
-static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
-{
-       guard(rcu)();
-       return bucket_gen_get_rcu(ca, b);
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
-                                  const struct bch_extent_ptr *ptr)
-{
-       return sector_to_bucket(ca, ptr->offset);
-}
-
-static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
-                                        const struct bch_extent_ptr *ptr)
-{
-       return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
-                                               const struct bch_extent_ptr *ptr,
-                                               u32 *bucket_offset)
-{
-       return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
-}
-
-static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
-                                          const struct bch_extent_ptr *ptr)
-{
-       return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline enum bch_data_type ptr_data_type(const struct bkey *k,
-                                              const struct bch_extent_ptr *ptr)
-{
-       if (bkey_is_btree_ptr(k))
-               return BCH_DATA_btree;
-
-       return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
-}
-
-static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
-       EBUG_ON(sectors < 0);
-
-       return crc_is_compressed(p.crc)
-               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
-                                  p.crc.uncompressed_size)
-               : sectors;
-}
-
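-/*
- * Bucket gens are 8 bits and wrap: casting the difference to s8 gives a
- * correct ordering as long as the two gens are within 127 of each other.
- */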
-static inline int gen_cmp(u8 a, u8 b)
-{
-       return (s8) (a - b);
-}
-
-static inline int gen_after(u8 a, u8 b)
-{
-       return max(0, gen_cmp(a, b));
-}
-
-static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-       int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
-       return gen < 0 ? gen : gen_after(gen, ptr->gen);
-}
-
-/**
- * dev_ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-       guard(rcu)();
-       return dev_ptr_stale_rcu(ca, ptr);
-}
-
-/* Device usage: */
-
-void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
-static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
-{
-       struct bch_dev_usage ret;
-
-       bch2_dev_usage_read_fast(ca, &ret);
-       return ret;
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
-static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
-{
-       struct bch_dev_usage_full ret;
-
-       bch2_dev_usage_full_read_fast(ca, &ret);
-       return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
-
-static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
-{
-       s64 reserved = 0;
-
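-       /*
-        * Deliberate fallthroughs: a less strict watermark has to leave behind
-        * the reserves of every stricter watermark below it, so the amounts
-        * accumulate down the switch:
-        */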
-       switch (watermark) {
-       case BCH_WATERMARK_NR:
-               BUG();
-       case BCH_WATERMARK_stripe:
-               reserved += ca->mi.nbuckets >> 6;
-               fallthrough;
-       case BCH_WATERMARK_normal:
-               reserved += ca->mi.nbuckets >> 6;
-               fallthrough;
-       case BCH_WATERMARK_copygc:
-               reserved += ca->nr_btree_reserve;
-               fallthrough;
-       case BCH_WATERMARK_btree:
-               reserved += ca->nr_btree_reserve;
-               fallthrough;
-       case BCH_WATERMARK_btree_copygc:
-       case BCH_WATERMARK_reclaim:
-       case BCH_WATERMARK_interior_updates:
-               break;
-       }
-
-       return reserved;
-}
-
-static inline u64 dev_buckets_free(struct bch_dev *ca,
-                                  struct bch_dev_usage usage,
-                                  enum bch_watermark watermark)
-{
-       return max_t(s64, 0,
                    usage.buckets[BCH_DATA_free] -
-                    ca->nr_open_buckets -
-                    bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
-                                         struct bch_dev_usage usage,
-                                         enum bch_watermark watermark)
-{
-       return max_t(s64, 0,
-                      usage.buckets[BCH_DATA_free]
-                    + usage.buckets[BCH_DATA_cached]
-                    + usage.buckets[BCH_DATA_need_gc_gens]
-                    + usage.buckets[BCH_DATA_need_discard]
-                    - ca->nr_open_buckets
-                    - bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca,
-                                       enum bch_watermark watermark)
-{
-       return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
-}
-
-/* Filesystem usage: */
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *);
-
-int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
-                          struct bkey_s_c, const struct bch_extent_ptr *,
-                          s64, enum bch_data_type, u8, u8, u32 *);
-
-int bch2_check_fix_ptrs(struct btree_trans *,
-                       enum btree_id, unsigned, struct bkey_s_c,
-                       enum btree_iter_update_trigger_flags);
-
-int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
-                       struct bkey_s_c, struct bkey_s,
-                       enum btree_iter_update_trigger_flags);
-int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
-                         struct bkey_s_c, struct bkey_s,
-                         enum btree_iter_update_trigger_flags);
-
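-/*
- * Run the trigger twice: once for the key being overwritten, with the insert
- * flag cleared, and once for the key being inserted, with the overwrite flag
- * cleared:
- */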
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
-({                                                                                             \
-       int ret = 0;                                                                            \
-                                                                                               \
-       if (_old.k->type)                                                                       \
-               ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert);     \
-       if (!ret && _new.k->type)                                                               \
-               ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
-       ret;                                                                                    \
-})
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
-                                   enum bch_data_type, unsigned,
-                                   enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
-                                   enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
-                                   enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs(struct bch_fs *);
-
-bool bch2_is_superblock_bucket(struct bch_dev *, u64);
-
-static inline const char *bch2_data_type_str(enum bch_data_type type)
-{
-       return type < BCH_DATA_NR
-               ? __bch2_data_types[type]
-               : "(invalid data type)";
-}
-
-/* disk reservations: */
-
-static inline void bch2_disk_reservation_put(struct bch_fs *c,
-                                            struct disk_reservation *res)
-{
-       if (res->sectors) {
-               this_cpu_sub(*c->online_reserved, res->sectors);
-               res->sectors = 0;
-       }
-}
-
-enum bch_reservation_flags {
-       BCH_DISK_RESERVATION_NOFAIL     = 1 << 0,
-       BCH_DISK_RESERVATION_PARTIAL    = 1 << 1,
-};
-
-int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
-                               u64, enum bch_reservation_flags);
-
-static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
-                                           u64 sectors, enum bch_reservation_flags flags)
-{
-#ifdef __KERNEL__
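-       /*
-        * Fast path: take sectors out of this cpu's share of the reservation
-        * pool; fall back to __bch2_disk_reservation_add() when it runs dry:
-        */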
-       u64 old, new;
-
-       old = this_cpu_read(c->pcpu->sectors_available);
-       do {
-               if (sectors > old)
-                       return __bch2_disk_reservation_add(c, res, sectors, flags);
-
-               new = old - sectors;
-       } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
-
-       this_cpu_add(*c->online_reserved, sectors);
-       res->sectors                    += sectors;
-       return 0;
-#else
-       return __bch2_disk_reservation_add(c, res, sectors, flags);
-#endif
-}
-
-static inline struct disk_reservation
-bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
-{
-       return (struct disk_reservation) {
-               .sectors        = 0,
-#if 0
-               /* not used yet: */
-               .gen            = c->capacity_gen,
-#endif
-               .nr_replicas    = nr_replicas,
-       };
-}
-
-static inline int bch2_disk_reservation_get(struct bch_fs *c,
-                                           struct disk_reservation *res,
-                                           u64 sectors, unsigned nr_replicas,
-                                           int flags)
-{
-       *res = bch2_disk_reservation_init(c, nr_replicas);
-
-       return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
-}
-
-#define RESERVE_FACTOR 6
-
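-/*
- * Scale by 64/65: hold back ~1.5% of capacity from the space we report as
- * available:
- */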
-static inline u64 avail_factor(u64 r)
-{
-       return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
-}
-
-void bch2_buckets_nouse_free(struct bch_fs *);
-int bch2_buckets_nouse_alloc(struct bch_fs *);
-
-int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
-void bch2_dev_buckets_free(struct bch_dev *);
-int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
-
-#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
deleted file mode 100644 (file)
index 0aed250..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-#include "bcachefs_format.h"
-#include "util.h"
-
-#define BUCKET_JOURNAL_SEQ_BITS                16
-
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- *   while (xchg(&b->lock, 1)) cpu_relax();
- * but it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR      0
-#else
-#define BUCKET_LOCK_BITNR      (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
-       ulong   ulong;
-       u8      byte;
-};
-
-struct bucket {
-       u8                      lock;
-       u8                      gen_valid:1;
-       u8                      data_type:7;
-       u8                      gen;
-       u8                      stripe_redundancy;
-       u32                     stripe;
-       u32                     dirty_sectors;
-       u32                     cached_sectors;
-       u32                     stripe_sectors;
-} __aligned(sizeof(long));
-
-struct bucket_gens {
-       struct rcu_head         rcu;
-       u16                     first_bucket;
-       size_t                  nbuckets;
-       size_t                  nbuckets_minus_first;
-       u8                      b[] __counted_by(nbuckets);
-};
-
-/* Only info on bucket counts: */
-struct bch_dev_usage {
-       u64                     buckets[BCH_DATA_NR];
-};
-
-struct bch_dev_usage_full {
-       struct bch_dev_usage_type {
-               u64             buckets;
-               u64             sectors; /* _compressed_ sectors: */
-               /*
-                * XXX
-                * Why do we have this? Isn't it just buckets * bucket_size -
-                * sectors?
-                */
-               u64             fragmented;
-       }                       d[BCH_DATA_NR];
-};
-
-struct bch_fs_usage_base {
-       u64                     hidden;
-       u64                     btree;
-       u64                     data;
-       u64                     cached;
-       u64                     reserved;
-       u64                     nr_inodes;
-};
-
-struct bch_fs_usage_short {
-       u64                     capacity;
-       u64                     used;
-       u64                     free;
-       u64                     nr_inodes;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
-       u64                     sectors;
-       u32                     gen;
-       unsigned                nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
deleted file mode 100644 (file)
index 832eff9..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets_waiting_for_journal.h"
-#include <linux/hash.h>
-#include <linux/random.h>
-
-static inline struct bucket_hashed *
-bucket_hash(struct buckets_waiting_for_journal_table *t,
-           unsigned hash_seed_idx, u64 dev_bucket)
-{
-       return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
-}
-
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
-{
-       unsigned i;
-
-       t->bits = bits;
-       for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
-               get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
-       memset(t->d, 0, sizeof(t->d[0]) << t->bits);
-}
-
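-/*
- * Returns the journal sequence number the given bucket is waiting on, or 0 if
- * it isn't waiting on anything:
- */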
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
-                                 unsigned dev, u64 bucket)
-{
-       struct buckets_waiting_for_journal_table *t;
-       u64 dev_bucket = (u64) dev << 56 | bucket;
-       u64 ret = 0;
-
-       mutex_lock(&b->lock);
-       t = b->t;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
-               struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
-
-               if (h->dev_bucket == dev_bucket) {
-                       ret = h->journal_seq;
-                       break;
-               }
-       }
-
-       mutex_unlock(&b->lock);
-
-       return ret;
-}
-
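-/*
- * Cuckoo-style insert: every element has ARRAY_SIZE(t->hash_seeds) candidate
- * slots; on collision we evict a victim into one of its alternate slots,
- * bounded at 10 tries before giving up (the caller then rehashes into a
- * bigger table):
- */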
-static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
-                               struct bucket_hashed *new,
-                               u64 flushed_seq)
-{
-       struct bucket_hashed *last_evicted = NULL;
-       unsigned tries, i;
-
-       for (tries = 0; tries < 10; tries++) {
-               struct bucket_hashed *old, *victim = NULL;
-
-               for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
-                       old = bucket_hash(t, i, new->dev_bucket);
-
-                       if (old->dev_bucket == new->dev_bucket ||
-                           old->journal_seq <= flushed_seq) {
-                               *old = *new;
-                               return true;
-                       }
-
-                       if (last_evicted != old)
-                               victim = old;
-               }
-
-               /* hashed to same slot 3 times: */
-               if (!victim)
-                       break;
-
-               /* Failed to find an empty slot: */
-               swap(*new, *victim);
-               last_evicted = victim;
-       }
-
-       return false;
-}
-
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
-                                        u64 flushed_seq,
-                                        unsigned dev, u64 bucket,
-                                        u64 journal_seq)
-{
-       struct buckets_waiting_for_journal_table *t, *n;
-       struct bucket_hashed tmp, new = {
-               .dev_bucket     = (u64) dev << 56 | bucket,
-               .journal_seq    = journal_seq,
-       };
-       size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
-       int ret = 0;
-
-       mutex_lock(&b->lock);
-
-       if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
-               goto out;
-
-       t = b->t;
-       size = 1UL << t->bits;
-       for (i = 0; i < size; i++)
-               nr_elements += t->d[i].journal_seq > flushed_seq;
-
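-       /*
-        * Size the new table at ~3x the number of live entries, to keep the
-        * load factor low enough for cuckoo inserts to succeed:
-        */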
-       new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
-realloc:
-       n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
-       if (!n) {
-               struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
-               ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
-               goto out;
-       }
-
-retry_rehash:
-       if (nr_rehashes_this_size == 3) {
-               new_bits++;
-               nr_rehashes_this_size = 0;
-               kvfree(n);
-               goto realloc;
-       }
-
-       nr_rehashes++;
-       nr_rehashes_this_size++;
-
-       bucket_table_init(n, new_bits);
-
-       tmp = new;
-       BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
-
-       for (i = 0; i < 1UL << t->bits; i++) {
-               if (t->d[i].journal_seq <= flushed_seq)
-                       continue;
-
-               tmp = t->d[i];
-               if (!bucket_table_insert(n, &tmp, flushed_seq))
-                       goto retry_rehash;
-       }
-
-       b->t = n;
-       kvfree(t);
-
-       pr_debug("took %zu rehashes, table at %zu/%lu elements",
-                nr_rehashes, nr_elements, 1UL << b->t->bits);
-out:
-       mutex_unlock(&b->lock);
-
-       return ret;
-}
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
-{
-       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
-       kvfree(b->t);
-}
-
-#define INITIAL_TABLE_BITS             3
-
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
-{
-       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
-       mutex_init(&b->lock);
-
-       b->t = kvmalloc(sizeof(*b->t) +
-                       (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
-       if (!b->t)
-               return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
-
-       bucket_table_init(b->t, INITIAL_TABLE_BITS);
-       return 0;
-}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
deleted file mode 100644 (file)
index 365619c..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_H
-
-#include "buckets_waiting_for_journal_types.h"
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
-                                 unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
-                                        u64, unsigned, u64, u64);
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
deleted file mode 100644 (file)
index e593db0..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-
-#include <linux/siphash.h>
-
-struct bucket_hashed {
-       u64                     dev_bucket;
-       u64                     journal_seq;
-};
-
-struct buckets_waiting_for_journal_table {
-       unsigned                bits;
-       u64                     hash_seeds[3];
-       struct bucket_hashed    d[];
-};
-
-struct buckets_waiting_for_journal {
-       struct mutex            lock;
-       struct buckets_waiting_for_journal_table *t;
-};
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
deleted file mode 100644 (file)
index 5ea89aa..0000000
+++ /dev/null
@@ -1,843 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_CHARDEV
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "disk_accounting.h"
-#include "fsck.h"
-#include "journal.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-counters.h"
-#include "super-io.h"
-#include "thread_with_file.h"
-
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/major.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
-                                         unsigned flags)
-{
-       struct bch_dev *ca;
-
-       if (flags & BCH_BY_INDEX) {
-               if (dev >= c->sb.nr_devices)
-                       return ERR_PTR(-EINVAL);
-
-               ca = bch2_dev_tryget_noerror(c, dev);
-               if (!ca)
-                       return ERR_PTR(-EINVAL);
-       } else {
-               char *path;
-
-               path = strndup_user((const char __user *)
-                                   (unsigned long) dev, PATH_MAX);
-               if (IS_ERR(path))
-                       return ERR_CAST(path);
-
-               ca = bch2_dev_lookup(c, path);
-               kfree(path);
-       }
-
-       return ca;
-}
-
-#if 0
-static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
-       struct bch_ioctl_assemble arg;
-       struct bch_fs *c;
-       u64 *user_devs = NULL;
-       char **devs = NULL;
-       unsigned i;
-       int ret = -EFAULT;
-
-       if (copy_from_user(&arg, user_arg, sizeof(arg)))
-               return -EFAULT;
-
-       if (arg.flags || arg.pad)
-               return -EINVAL;
-
-       user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
-       if (!user_devs)
-               return -ENOMEM;
-
-       devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-       if (!devs) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       if (copy_from_user(user_devs, user_arg->devs,
-                          sizeof(u64) * arg.nr_devs))
-               goto err;
-
-       for (i = 0; i < arg.nr_devs; i++) {
-               devs[i] = strndup_user((const char __user *)(unsigned long)
-                                      user_devs[i],
-                                      PATH_MAX);
-               ret = PTR_ERR_OR_ZERO(devs[i]);
-               if (ret)
-                       goto err;
-       }
-
-       c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
-       ret = PTR_ERR_OR_ZERO(c);
-       if (!ret)
-               closure_put(&c->cl);
-err:
-       if (devs)
-               for (i = 0; i < arg.nr_devs; i++)
-                       kfree(devs[i]);
-       kfree(devs);
-       kfree(user_devs);
-       return ret;
-}
-
-static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
-       struct bch_ioctl_incremental arg;
-       const char *err;
-       char *path;
-       int ret;
-
-       if (copy_from_user(&arg, user_arg, sizeof(arg)))
-               return -EFAULT;
-
-       if (arg.flags || arg.pad)
-               return -EINVAL;
-
-       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       ret = PTR_ERR_OR_ZERO(path);
-       if (ret)
-               return ret;
-
-       err = bch2_fs_open_incremental(path);
-       kfree(path);
-
-       if (err) {
-               pr_err("Could not register bcachefs devices: %s", err);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-#endif
-
-static long bch2_global_ioctl(unsigned cmd, void __user *arg)
-{
-       long ret;
-
-       switch (cmd) {
-#if 0
-       case BCH_IOCTL_ASSEMBLE:
-               return bch2_ioctl_assemble(arg);
-       case BCH_IOCTL_INCREMENTAL:
-               return bch2_ioctl_incremental(arg);
-#endif
-       case BCH_IOCTL_FSCK_OFFLINE: {
-               ret = bch2_ioctl_fsck_offline(arg);
-               break;
-       }
-       default:
-               ret = -ENOTTY;
-               break;
-       }
-
-       if (ret < 0)
-               ret = bch2_err_class(ret);
-       return ret;
-}
-
-static long bch2_ioctl_query_uuid(struct bch_fs *c,
-                       struct bch_ioctl_query_uuid __user *user_arg)
-{
-       return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
-                                   sizeof(c->sb.user_uuid));
-}
-
-#if 0
-static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
-{
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (arg.flags || arg.pad)
-               return -EINVAL;
-
-       return bch2_fs_start(c);
-}
-
-static long bch2_ioctl_stop(struct bch_fs *c)
-{
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       bch2_fs_stop(c);
-       return 0;
-}
-#endif
-
-static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-       char *path;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (arg.flags || arg.pad)
-               return -EINVAL;
-
-       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       ret = PTR_ERR_OR_ZERO(path);
-       if (ret)
-               return ret;
-
-       ret = bch2_dev_add(c, path);
-       if (!IS_ERR(path))
-               kfree(path);
-
-       return ret;
-}
-
-static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-       struct bch_dev *ca;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-                          BCH_FORCE_IF_METADATA_LOST|
-                          BCH_FORCE_IF_DEGRADED|
-                          BCH_BY_INDEX)) ||
-           arg.pad)
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       return bch2_dev_remove(c, ca, arg.flags);
-}
-
-static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-       char *path;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (arg.flags || arg.pad)
-               return -EINVAL;
-
-       path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       ret = PTR_ERR_OR_ZERO(path);
-       if (ret)
-               return ret;
-
-       ret = bch2_dev_online(c, path);
-       kfree(path);
-       return ret;
-}
-
-static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
-       struct bch_dev *ca;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-                          BCH_FORCE_IF_METADATA_LOST|
-                          BCH_FORCE_IF_DEGRADED|
-                          BCH_BY_INDEX)) ||
-           arg.pad)
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       ret = bch2_dev_offline(c, ca, arg.flags);
-       bch2_dev_put(ca);
-       return ret;
-}
-
-static long bch2_ioctl_disk_set_state(struct bch_fs *c,
-                       struct bch_ioctl_disk_set_state arg)
-{
-       struct bch_dev *ca;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
-                          BCH_FORCE_IF_METADATA_LOST|
-                          BCH_FORCE_IF_DEGRADED|
-                          BCH_BY_INDEX)) ||
-           arg.pad[0] || arg.pad[1] || arg.pad[2] ||
-           arg.new_state >= BCH_MEMBER_STATE_NR)
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
-       if (ret)
-               bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
-
-       bch2_dev_put(ca);
-       return ret;
-}
-
-struct bch_data_ctx {
-       struct thread_with_file         thr;
-
-       struct bch_fs                   *c;
-       struct bch_ioctl_data           arg;
-       struct bch_move_stats           stats;
-};
-
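-/*
- * Data jobs run in a kthread; userspace monitors progress by reading
- * bch_ioctl_data_event structs from the file descriptor returned by
- * BCH_IOCTL_DATA (see bch2_data_job_read()):
- */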
-static int bch2_data_thread(void *arg)
-{
-       struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
-
-       ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
-       if (ctx->thr.ret == -BCH_ERR_device_offline)
-               ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
-       else {
-               ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
-               ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
-       }
-       enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data);
-       return 0;
-}
-
-static int bch2_data_job_release(struct inode *inode, struct file *file)
-{
-       struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-
-       bch2_thread_with_file_exit(&ctx->thr);
-       kfree(ctx);
-       return 0;
-}
-
-static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
-                                 size_t len, loff_t *ppos)
-{
-       struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-       struct bch_fs *c = ctx->c;
-       struct bch_ioctl_data_event e = {
-               .type                           = BCH_DATA_EVENT_PROGRESS,
-               .ret                            = ctx->stats.ret,
-               .p.data_type                    = ctx->stats.data_type,
-               .p.btree_id                     = ctx->stats.pos.btree,
-               .p.pos                          = ctx->stats.pos.pos,
-               .p.sectors_done                 = atomic64_read(&ctx->stats.sectors_seen),
-               .p.sectors_error_corrected      = atomic64_read(&ctx->stats.sectors_error_corrected),
-               .p.sectors_error_uncorrected    = atomic64_read(&ctx->stats.sectors_error_uncorrected),
-       };
-
-       if (ctx->arg.op == BCH_DATA_OP_scrub) {
-               struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
-               if (ca) {
-                       struct bch_dev_usage_full u;
-                       bch2_dev_usage_full_read_fast(ca, &u);
-                       for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
-                               if (ctx->arg.scrub.data_types & BIT(i))
-                                       e.p.sectors_total += u.d[i].sectors;
-                       bch2_dev_put(ca);
-               }
-       } else {
-               e.p.sectors_total       = bch2_fs_usage_read_short(c).used;
-       }
-
-       if (len < sizeof(e))
-               return -EINVAL;
-
-       return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
-}
-
-static const struct file_operations bcachefs_data_ops = {
-       .release        = bch2_data_job_release,
-       .read           = bch2_data_job_read,
-};
-
-static long bch2_ioctl_data(struct bch_fs *c,
-                           struct bch_ioctl_data arg)
-{
-       struct bch_data_ctx *ctx;
-       int ret;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data))
-               return -EROFS;
-
-       if (!capable(CAP_SYS_ADMIN)) {
-               ret = -EPERM;
-               goto put_ref;
-       }
-
-       if (arg.op >= BCH_DATA_OP_NR || arg.flags) {
-               ret = -EINVAL;
-               goto put_ref;
-       }
-
-       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-       if (!ctx) {
-               ret = -ENOMEM;
-               goto put_ref;
-       }
-
-       ctx->c = c;
-       ctx->arg = arg;
-
-       ret = bch2_run_thread_with_file(&ctx->thr,
-                       &bcachefs_data_ops,
-                       bch2_data_thread);
-       if (ret < 0)
-               goto cleanup;
-       return ret;
-cleanup:
-       kfree(ctx);
-put_ref:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data);
-       return ret;
-}
-
-static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c,
-                               struct bch_ioctl_fs_usage __user *user_arg)
-{
-       struct bch_ioctl_fs_usage arg = {};
-       darray_char replicas = {};
-       u32 replica_entries_bytes;
-       int ret = 0;
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EINVAL;
-
-       if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
-               return -EFAULT;
-
-       ret   = bch2_fs_replicas_usage_read(c, &replicas) ?:
-               (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
-               copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
-       if (ret)
-               goto err;
-
-       struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
-       arg.capacity            = c->capacity;
-       arg.used                = u.used;
-       arg.online_reserved     = percpu_u64_get(c->online_reserved);
-       arg.replica_entries_bytes = replicas.nr;
-
-       for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
-               struct disk_accounting_pos k;
-               disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i);
-
-               bch2_accounting_mem_read(c,
-                                        disk_accounting_pos_to_bpos(&k),
-                                        &arg.persistent_reserved[i], 1);
-       }
-
-       ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
-       darray_exit(&replicas);
-       return ret;
-}
-
-static long bch2_ioctl_query_accounting(struct bch_fs *c,
-                       struct bch_ioctl_query_accounting __user *user_arg)
-{
-       struct bch_ioctl_query_accounting arg;
-       darray_char accounting = {};
-       int ret = 0;
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EINVAL;
-
-       ret   = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
-               bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
-               (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
-               copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
-       if (ret)
-               goto err;
-
-       arg.capacity            = c->capacity;
-       arg.used                = bch2_fs_usage_read_short(c).used;
-       arg.online_reserved     = percpu_u64_get(c->online_reserved);
-       arg.accounting_u64s     = accounting.nr / sizeof(u64);
-
-       ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
-       darray_exit(&accounting);
-       return ret;
-}
-
-/* obsolete, didn't allow for new data types: */
-static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c,
-                                struct bch_ioctl_dev_usage __user *user_arg)
-{
-       struct bch_ioctl_dev_usage arg;
-       struct bch_dev_usage_full src;
-       struct bch_dev *ca;
-       unsigned i;
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EINVAL;
-
-       if (copy_from_user(&arg, user_arg, sizeof(arg)))
-               return -EFAULT;
-
-       if ((arg.flags & ~BCH_BY_INDEX) ||
-           arg.pad[0] ||
-           arg.pad[1] ||
-           arg.pad[2])
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       src = bch2_dev_usage_full_read(ca);
-
-       arg.state               = ca->mi.state;
-       arg.bucket_size         = ca->mi.bucket_size;
-       arg.nr_buckets          = ca->mi.nbuckets - ca->mi.first_bucket;
-
-       for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
-               arg.d[i].buckets        = src.d[i].buckets;
-               arg.d[i].sectors        = src.d[i].sectors;
-               arg.d[i].fragmented     = src.d[i].fragmented;
-       }
-
-       bch2_dev_put(ca);
-
-       return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-}
-
-static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
-                                struct bch_ioctl_dev_usage_v2 __user *user_arg)
-{
-       struct bch_ioctl_dev_usage_v2 arg;
-       struct bch_dev_usage_full src;
-       struct bch_dev *ca;
-       int ret = 0;
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EINVAL;
-
-       if (copy_from_user(&arg, user_arg, sizeof(arg)))
-               return -EFAULT;
-
-       if ((arg.flags & ~BCH_BY_INDEX) ||
-           arg.pad[0] ||
-           arg.pad[1] ||
-           arg.pad[2])
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       src = bch2_dev_usage_full_read(ca);
-
-       arg.state               = ca->mi.state;
-       arg.bucket_size         = ca->mi.bucket_size;
-       arg.nr_data_types       = min(arg.nr_data_types, BCH_DATA_NR);
-       arg.nr_buckets          = ca->mi.nbuckets - ca->mi.first_bucket;
-
-       ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-       if (ret)
-               goto err;
-
-       for (unsigned i = 0; i < arg.nr_data_types; i++) {
-               struct bch_ioctl_dev_usage_type t = {
-                       .buckets        = src.d[i].buckets,
-                       .sectors        = src.d[i].sectors,
-                       .fragmented     = src.d[i].fragmented,
-               };
-
-               ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
-               if (ret)
-                       goto err;
-       }
-err:
-       bch2_dev_put(ca);
-       return ret;
-}
-
-static long bch2_ioctl_read_super(struct bch_fs *c,
-                                 struct bch_ioctl_read_super arg)
-{
-       struct bch_dev *ca = NULL;
-       struct bch_sb *sb;
-       int ret = 0;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
-           arg.pad)
-               return -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-
-       if (arg.flags & BCH_READ_DEV) {
-               ca = bch2_device_lookup(c, arg.dev, arg.flags);
-               ret = PTR_ERR_OR_ZERO(ca);
-               if (ret)
-                       goto err_unlock;
-
-               sb = ca->disk_sb.sb;
-       } else {
-               sb = c->disk_sb.sb;
-       }
-
-       if (vstruct_bytes(sb) > arg.size) {
-               ret = -ERANGE;
-               goto err;
-       }
-
-       ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
-                                  vstruct_bytes(sb));
-err:
-       bch2_dev_put(ca);
-err_unlock:
-       mutex_unlock(&c->sb_lock);
-       return ret;
-}
-
-static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
-                                   struct bch_ioctl_disk_get_idx arg)
-{
-       dev_t dev = huge_decode_dev(arg.dev);
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (!dev)
-               return -EINVAL;
-
-       guard(rcu)();
-       for_each_online_member_rcu(c, ca)
-               if (ca->dev == dev)
-                       return ca->dev_idx;
-
-       return bch_err_throw(c, ENOENT_dev_idx_not_found);
-}
-
-static long bch2_ioctl_disk_resize(struct bch_fs *c,
-                                  struct bch_ioctl_disk_resize arg)
-{
-       struct bch_dev *ca;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~BCH_BY_INDEX) ||
-           arg.pad)
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       ret = bch2_dev_resize(c, ca, arg.nbuckets);
-
-       bch2_dev_put(ca);
-       return ret;
-}
-
-static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
-                                  struct bch_ioctl_disk_resize_journal arg)
-{
-       struct bch_dev *ca;
-       int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((arg.flags & ~BCH_BY_INDEX) ||
-           arg.pad)
-               return -EINVAL;
-
-       if (arg.nbuckets > U32_MAX)
-               return -EINVAL;
-
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
-
-       ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
-
-       bch2_dev_put(ca);
-       return ret;
-}
-
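-/*
- * Copy the ioctl argument struct in from userspace, call the handler, then
- * jump to out for common error class conversion:
- */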
-#define BCH_IOCTL(_name, _argtype)                                     \
-do {                                                                   \
-       _argtype i;                                                     \
-                                                                       \
-       if (copy_from_user(&i, arg, sizeof(i)))                         \
-               return -EFAULT;                                         \
-       ret = bch2_ioctl_##_name(c, i);                                 \
-       goto out;                                                       \
-} while (0)
-
-long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
-       long ret;
-
-       switch (cmd) {
-       case BCH_IOCTL_QUERY_UUID:
-               return bch2_ioctl_query_uuid(c, arg);
-       case BCH_IOCTL_FS_USAGE:
-               return bch2_ioctl_fs_usage(c, arg);
-       case BCH_IOCTL_DEV_USAGE:
-               return bch2_ioctl_dev_usage(c, arg);
-       case BCH_IOCTL_DEV_USAGE_V2:
-               return bch2_ioctl_dev_usage_v2(c, arg);
-#if 0
-       case BCH_IOCTL_START:
-               BCH_IOCTL(start, struct bch_ioctl_start);
-       case BCH_IOCTL_STOP:
-               return bch2_ioctl_stop(c);
-#endif
-       case BCH_IOCTL_READ_SUPER:
-               BCH_IOCTL(read_super, struct bch_ioctl_read_super);
-       case BCH_IOCTL_DISK_GET_IDX:
-               BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
-       }
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EINVAL;
-
-       switch (cmd) {
-       case BCH_IOCTL_DISK_ADD:
-               BCH_IOCTL(disk_add, struct bch_ioctl_disk);
-       case BCH_IOCTL_DISK_REMOVE:
-               BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
-       case BCH_IOCTL_DISK_ONLINE:
-               BCH_IOCTL(disk_online, struct bch_ioctl_disk);
-       case BCH_IOCTL_DISK_OFFLINE:
-               BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
-       case BCH_IOCTL_DISK_SET_STATE:
-               BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
-       case BCH_IOCTL_DATA:
-               BCH_IOCTL(data, struct bch_ioctl_data);
-       case BCH_IOCTL_DISK_RESIZE:
-               BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
-       case BCH_IOCTL_DISK_RESIZE_JOURNAL:
-               BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-       case BCH_IOCTL_FSCK_ONLINE:
-               BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
-       case BCH_IOCTL_QUERY_ACCOUNTING:
-               return bch2_ioctl_query_accounting(c, arg);
-       case BCH_IOCTL_QUERY_COUNTERS:
-               return bch2_ioctl_query_counters(c, arg);
-       default:
-               return -ENOTTY;
-       }
-out:
-       if (ret < 0)
-               ret = bch2_err_class(ret);
-       return ret;
-}
-
-static DEFINE_IDR(bch_chardev_minor);
-
-static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
-       unsigned minor = iminor(file_inode(filp));
-       struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
-       void __user *arg = (void __user *) v;
-
-       return c
-               ? bch2_fs_ioctl(c, cmd, arg)
-               : bch2_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
-       .owner          = THIS_MODULE,
-       .unlocked_ioctl = bch2_chardev_ioctl,
-       .open           = nonseekable_open,
-};
-
-static int bch_chardev_major;
-static const struct class bch_chardev_class = {
-       .name = "bcachefs",
-};
-static struct device *bch_chardev;
-
-void bch2_fs_chardev_exit(struct bch_fs *c)
-{
-       if (!IS_ERR_OR_NULL(c->chardev))
-               device_unregister(c->chardev);
-       if (c->minor >= 0)
-               idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch2_fs_chardev_init(struct bch_fs *c)
-{
-       c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
-       if (c->minor < 0)
-               return c->minor;
-
-       c->chardev = device_create(&bch_chardev_class, NULL,
-                                  MKDEV(bch_chardev_major, c->minor), c,
-                                  "bcachefs%u-ctl", c->minor);
-       if (IS_ERR(c->chardev))
-               return PTR_ERR(c->chardev);
-
-       return 0;
-}
-
-void bch2_chardev_exit(void)
-{
-       device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
-       class_unregister(&bch_chardev_class);
-       if (bch_chardev_major > 0)
-               unregister_chrdev(bch_chardev_major, "bcachefs");
-}
-
-int __init bch2_chardev_init(void)
-{
-       int ret;
-
-       bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
-       if (bch_chardev_major < 0)
-               return bch_chardev_major;
-
-       ret = class_register(&bch_chardev_class);
-       if (ret)
-               goto major_out;
-
-       bch_chardev = device_create(&bch_chardev_class, NULL,
-                                   MKDEV(bch_chardev_major, U8_MAX),
-                                   NULL, "bcachefs-ctl");
-       if (IS_ERR(bch_chardev)) {
-               ret = PTR_ERR(bch_chardev);
-               goto class_out;
-       }
-
-       return 0;
-
-class_out:
-       class_unregister(&bch_chardev_class);
-major_out:
-       unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
-       return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
deleted file mode 100644 (file)
index 0f563ca..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHARDEV_H
-#define _BCACHEFS_CHARDEV_H
-
-#ifndef NO_BCACHEFS_FS
-
-long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch2_fs_chardev_exit(struct bch_fs *);
-int bch2_fs_chardev_init(struct bch_fs *);
-
-void bch2_chardev_exit(void);
-int __init bch2_chardev_init(void);
-
-#else
-
-static inline long bch2_fs_ioctl(struct bch_fs *c,
                                unsigned cmd, void __user *arg)
-{
-       return -ENOTTY;
-}
-
-static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_chardev_exit(void) {}
-static inline int __init bch2_chardev_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
deleted file mode 100644 (file)
index a6795e7..0000000
+++ /dev/null
@@ -1,698 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "errcode.h"
-#include "error.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/xxhash.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <crypto/chacha.h>
-#include <crypto/poly1305.h>
-#include <keys/user-type.h>
-
-/*
- * struct bch2_checksum_state is an abstraction of the checksum state
- * calculated over different pages. It lets the checksum be computed
- * incrementally, page by page, without the algorithm losing its state. For
- * native checksum algorithms (like crc), a default seed value will do; for
- * hash-like algorithms, a state needs to be stored.
- */
-
-struct bch2_checksum_state {
-       union {
-               u64 seed;
-               struct xxh64_state h64state;
-       };
-       unsigned int type;
-};
-
-static void bch2_checksum_init(struct bch2_checksum_state *state)
-{
-       switch (state->type) {
-       case BCH_CSUM_none:
-       case BCH_CSUM_crc32c:
-       case BCH_CSUM_crc64:
-               state->seed = 0;
-               break;
-       case BCH_CSUM_crc32c_nonzero:
-               state->seed = U32_MAX;
-               break;
-       case BCH_CSUM_crc64_nonzero:
-               state->seed = U64_MAX;
-               break;
-       case BCH_CSUM_xxhash:
-               xxh64_reset(&state->h64state, 0);
-               break;
-       default:
-               BUG();
-       }
-}
-
-static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
-{
-       switch (state->type) {
-       case BCH_CSUM_none:
-       case BCH_CSUM_crc32c:
-       case BCH_CSUM_crc64:
-               return state->seed;
-       case BCH_CSUM_crc32c_nonzero:
-               return state->seed ^ U32_MAX;
-       case BCH_CSUM_crc64_nonzero:
-               return state->seed ^ U64_MAX;
-       case BCH_CSUM_xxhash:
-               return xxh64_digest(&state->h64state);
-       default:
-               BUG();
-       }
-}
-
-static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
-{
-       switch (state->type) {
-       case BCH_CSUM_none:
-               return;
-       case BCH_CSUM_crc32c_nonzero:
-       case BCH_CSUM_crc32c:
-               state->seed = crc32c(state->seed, data, len);
-               break;
-       case BCH_CSUM_crc64_nonzero:
-       case BCH_CSUM_crc64:
-               state->seed = crc64_be(state->seed, data, len);
-               break;
-       case BCH_CSUM_xxhash:
-               xxh64_update(&state->h64state, data, len);
-               break;
-       default:
-               BUG();
-       }
-}
-
-static void bch2_chacha20_init(struct chacha_state *state,
-                              const struct bch_key *key, struct nonce nonce)
-{
-       u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
-
-       BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
-       memcpy(key_words, key, sizeof(key_words));
-       le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
-
-       BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
-       chacha_init(state, key_words, (const u8 *)nonce.d);
-
-       memzero_explicit(key_words, sizeof(key_words));
-}
-
-void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
-                  void *data, size_t len)
-{
-       struct chacha_state state;
-
-       bch2_chacha20_init(&state, key, nonce);
-       chacha20_crypt(&state, data, data, len);
-       chacha_zeroize_state(&state);
-}
-
-static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
-                              struct bch_fs *c, struct nonce nonce)
-{
-       u8 key[POLY1305_KEY_SIZE] = { 0 };
-
-       nonce.d[3] ^= BCH_NONCE_POLY;
-
-       bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
-       poly1305_init(desc, key);
-}
-
-struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
-                             struct nonce nonce, const void *data, size_t len)
-{
-       switch (type) {
-       case BCH_CSUM_none:
-       case BCH_CSUM_crc32c_nonzero:
-       case BCH_CSUM_crc64_nonzero:
-       case BCH_CSUM_crc32c:
-       case BCH_CSUM_xxhash:
-       case BCH_CSUM_crc64: {
-               struct bch2_checksum_state state;
-
-               state.type = type;
-
-               bch2_checksum_init(&state);
-               bch2_checksum_update(&state, data, len);
-
-               return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
-       }
-
-       case BCH_CSUM_chacha20_poly1305_80:
-       case BCH_CSUM_chacha20_poly1305_128: {
-               struct poly1305_desc_ctx dctx;
-               u8 digest[POLY1305_DIGEST_SIZE];
-               struct bch_csum ret = { 0 };
-
-               bch2_poly1305_init(&dctx, c, nonce);
-               poly1305_update(&dctx, data, len);
-               poly1305_final(&dctx, digest);
-
-               memcpy(&ret, digest, bch_crc_bytes[type]);
-               return ret;
-       }
-       default:
-               return (struct bch_csum) {};
-       }
-}
-
-int bch2_encrypt(struct bch_fs *c, unsigned type,
-                 struct nonce nonce, void *data, size_t len)
-{
-       if (!bch2_csum_type_is_encryption(type))
-               return 0;
-
-       if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
-                                   c, "attempting to encrypt without encryption key"))
-               return bch_err_throw(c, no_encryption_key);
-
-       bch2_chacha20(&c->chacha20_key, nonce, data, len);
-       return 0;
-}
-
-static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
-                                          struct nonce nonce, struct bio *bio,
-                                          struct bvec_iter *iter)
-{
-       struct bio_vec bv;
-
-       switch (type) {
-       case BCH_CSUM_none:
-               return (struct bch_csum) { 0 };
-       case BCH_CSUM_crc32c_nonzero:
-       case BCH_CSUM_crc64_nonzero:
-       case BCH_CSUM_crc32c:
-       case BCH_CSUM_xxhash:
-       case BCH_CSUM_crc64: {
-               struct bch2_checksum_state state;
-
-               state.type = type;
-               bch2_checksum_init(&state);
-
-#ifdef CONFIG_HIGHMEM
-               __bio_for_each_segment(bv, bio, *iter, *iter) {
-                       void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
-                       bch2_checksum_update(&state, p, bv.bv_len);
-                       kunmap_local(p);
-               }
-#else
-               __bio_for_each_bvec(bv, bio, *iter, *iter)
-                       bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
-                               bv.bv_len);
-#endif
-               return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
-       }
-
-       case BCH_CSUM_chacha20_poly1305_80:
-       case BCH_CSUM_chacha20_poly1305_128: {
-               struct poly1305_desc_ctx dctx;
-               u8 digest[POLY1305_DIGEST_SIZE];
-               struct bch_csum ret = { 0 };
-
-               bch2_poly1305_init(&dctx, c, nonce);
-
-#ifdef CONFIG_HIGHMEM
-               __bio_for_each_segment(bv, bio, *iter, *iter) {
-                       void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
-                       poly1305_update(&dctx, p, bv.bv_len);
-                       kunmap_local(p);
-               }
-#else
-               __bio_for_each_bvec(bv, bio, *iter, *iter)
-                       poly1305_update(&dctx,
-                               page_address(bv.bv_page) + bv.bv_offset,
-                               bv.bv_len);
-#endif
-               poly1305_final(&dctx, digest);
-
-               memcpy(&ret, digest, bch_crc_bytes[type]);
-               return ret;
-       }
-       default:
-               return (struct bch_csum) {};
-       }
-}
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
-                                 struct nonce nonce, struct bio *bio)
-{
-       struct bvec_iter iter = bio->bi_iter;
-
-       return __bch2_checksum_bio(c, type, nonce, bio, &iter);
-}
-
-int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
-                    struct nonce nonce, struct bio *bio)
-{
-       struct bio_vec bv;
-       struct bvec_iter iter;
-       struct chacha_state chacha_state;
-       int ret = 0;
-
-       if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
-                                   c, "attempting to encrypt without encryption key"))
-               return bch_err_throw(c, no_encryption_key);
-
-       bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
-
-       bio_for_each_segment(bv, bio, iter) {
-               void *p;
-
-               /*
-                * chacha_crypt() assumes that the length is a multiple of
-                * CHACHA_BLOCK_SIZE on any non-final call.
-                */
-               if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
-                       bch_err_ratelimited(c, "bio not aligned for encryption");
-                       ret = -EIO;
-                       break;
-               }
-
-               p = bvec_kmap_local(&bv);
-               chacha20_crypt(&chacha_state, p, p, bv.bv_len);
-               kunmap_local(p);
-       }
-       chacha_zeroize_state(&chacha_state);
-       return ret;
-}
-
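-/*
- * For the mergeable (CRC-style, linear) checksum types, the checksum of a
- * concatenation can be computed as csum(a extended by len(b) zero bytes)
- * xor csum(b) - so we run a's state over b_len zeroes, then xor in b:
- */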
-struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
-                                   struct bch_csum b, size_t b_len)
-{
-       struct bch2_checksum_state state;
-
-       state.type = type;
-       bch2_checksum_init(&state);
-       state.seed = le64_to_cpu(a.lo);
-
-       BUG_ON(!bch2_checksum_mergeable(type));
-
-       while (b_len) {
-               unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
-
-               bch2_checksum_update(&state,
-                               page_address(ZERO_PAGE(0)), page_len);
-               b_len -= page_len;
-       }
-       a.lo = cpu_to_le64(bch2_checksum_final(&state));
-       a.lo ^= b.lo;
-       a.hi ^= b.hi;
-       return a;
-}
-
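bch2_checksum_merge() works because these checksum types are linear: the checksum of A‖B equals the checksum of A extended by len(B) zero bytes (the seeded updates over ZERO_PAGE() above), XORed with the checksum of B. zlib exposes the same operation for plain CRC32 as crc32_combine(); a minimal userspace sketch of the property (assumes zlib is available; buffer contents are arbitrary):

    /* build with: cc crc_merge.c -lz */
    #include <assert.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
            unsigned char a[32], b[48];
            memset(a, 0xaa, sizeof(a));
            memset(b, 0x55, sizeof(b));

            /* CRC of the concatenation, computed directly */
            uLong whole = crc32(0L, Z_NULL, 0);
            whole = crc32(whole, a, sizeof(a));
            whole = crc32(whole, b, sizeof(b));

            /* CRCs of the two halves, merged after the fact */
            uLong ca = crc32(crc32(0L, Z_NULL, 0), a, sizeof(a));
            uLong cb = crc32(crc32(0L, Z_NULL, 0), b, sizeof(b));

            assert(crc32_combine(ca, cb, sizeof(b)) == whole);
            return 0;
    }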
-int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
-                       struct bversion version,
-                       struct bch_extent_crc_unpacked crc_old,
-                       struct bch_extent_crc_unpacked *crc_a,
-                       struct bch_extent_crc_unpacked *crc_b,
-                       unsigned len_a, unsigned len_b,
-                       unsigned new_csum_type)
-{
-       struct bvec_iter iter = bio->bi_iter;
-       struct nonce nonce = extent_nonce(version, crc_old);
-       struct bch_csum merged = { 0 };
-       struct crc_split {
-               struct bch_extent_crc_unpacked  *crc;
-               unsigned                        len;
-               unsigned                        csum_type;
-               struct bch_csum                 csum;
-       } splits[3] = {
-               { crc_a, len_a, new_csum_type, { 0 }},
-               { crc_b, len_b, new_csum_type, { 0 } },
-               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
-       }, *i;
-       bool mergeable = crc_old.csum_type == new_csum_type &&
-               bch2_checksum_mergeable(new_csum_type);
-       unsigned crc_nonce = crc_old.nonce;
-
-       BUG_ON(len_a + len_b > bio_sectors(bio));
-       BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
-       BUG_ON(crc_is_compressed(crc_old));
-       BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
-              bch2_csum_type_is_encryption(new_csum_type));
-
-       for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
-               iter.bi_size = i->len << 9;
-               if (mergeable || i->crc)
-                       i->csum = __bch2_checksum_bio(c, i->csum_type,
-                                                     nonce, bio, &iter);
-               else
-                       bio_advance_iter(bio, &iter, i->len << 9);
-               nonce = nonce_add(nonce, i->len << 9);
-       }
-
-       if (mergeable)
-               for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
-                       merged = bch2_checksum_merge(new_csum_type, merged,
-                                                    i->csum, i->len << 9);
-       else
-               merged = bch2_checksum_bio(c, crc_old.csum_type,
-                               extent_nonce(version, crc_old), bio);
-
-       if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
-               struct printbuf buf = PRINTBUF;
-               prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
-                          "  expected %0llx:%0llx got %0llx:%0llx (old type ",
-                          __func__,
-                          crc_old.csum.hi,
-                          crc_old.csum.lo,
-                          merged.hi,
-                          merged.lo);
-               bch2_prt_csum_type(&buf, crc_old.csum_type);
-               prt_str(&buf, " new type ");
-               bch2_prt_csum_type(&buf, new_csum_type);
-               prt_str(&buf, ")");
-               WARN_RATELIMIT(1, "%s", buf.buf);
-               printbuf_exit(&buf);
-               return bch_err_throw(c, recompute_checksum);
-       }
-
-       for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
-               if (i->crc)
-                       *i->crc = (struct bch_extent_crc_unpacked) {
-                               .csum_type              = i->csum_type,
-                               .compression_type       = crc_old.compression_type,
-                               .compressed_size        = i->len,
-                               .uncompressed_size      = i->len,
-                               .offset                 = 0,
-                               .live_size              = i->len,
-                               .nonce                  = crc_nonce,
-                               .csum                   = i->csum,
-                       };
-
-               if (bch2_csum_type_is_encryption(new_csum_type))
-                       crc_nonce += i->len;
-       }
-
-       return 0;
-}
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&crypt->field), sizeof(*crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       if (BCH_CRYPT_KDF_TYPE(crypt)) {
-               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       prt_printf(out, "KFD:               %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
-       prt_printf(out, "scrypt n:          %llu\n", BCH_KDF_SCRYPT_N(crypt));
-       prt_printf(out, "scrypt r:          %llu\n", BCH_KDF_SCRYPT_R(crypt));
-       prt_printf(out, "scrypt p:          %llu\n", BCH_KDF_SCRYPT_P(crypt));
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
-       .validate       = bch2_sb_crypt_validate,
-       .to_text        = bch2_sb_crypt_to_text,
-};
-
-#ifdef __KERNEL__
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
-       struct key *keyring_key;
-       const struct user_key_payload *ukp;
-       int ret;
-
-       keyring_key = request_key(&key_type_user, key_description, NULL);
-       if (IS_ERR(keyring_key))
-               return PTR_ERR(keyring_key);
-
-       down_read(&keyring_key->sem);
-       ukp = dereference_key_locked(keyring_key);
-       if (ukp->datalen == sizeof(*key)) {
-               memcpy(key, ukp->data, ukp->datalen);
-               ret = 0;
-       } else {
-               ret = -EINVAL;
-       }
-       up_read(&keyring_key->sem);
-       key_put(keyring_key);
-
-       return ret;
-}
-#else
-#include <keyutils.h>
-
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
-       key_serial_t key_id;
-
-       key_id = request_key("user", key_description, NULL,
-                            KEY_SPEC_SESSION_KEYRING);
-       if (key_id >= 0)
-               goto got_key;
-
-       key_id = request_key("user", key_description, NULL,
-                            KEY_SPEC_USER_KEYRING);
-       if (key_id >= 0)
-               goto got_key;
-
-       key_id = request_key("user", key_description, NULL,
-                            KEY_SPEC_USER_SESSION_KEYRING);
-       if (key_id >= 0)
-               goto got_key;
-
-       return -errno;
-got_key:
-
-       if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
-               return -1;
-
-       return 0;
-}
-
-#include "crypto.h"
-#endif
-
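Both variants resolve the key by its description, "bcachefs:<user uuid>" (built in bch2_request_key() below); the userspace fallback searches the session, user, and user-session keyrings in that order. The key can be staged ahead of time with keyctl(1) from keyutils — a hypothetical session (UUID and key material made up; in practice bcachefs-tools' `bcachefs unlock` derives the key from the passphrase and loads it for you):

    # load a previously derived raw key as a 'user' key in the user keyring
    echo -n "$RAW_KEY" | keyctl padd user bcachefs:6fae8a4b-... @u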
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
-{
-       struct printbuf key_description = PRINTBUF;
-       int ret;
-
-       prt_printf(&key_description, "bcachefs:");
-       pr_uuid(&key_description, sb->user_uuid.b);
-
-       ret = __bch2_request_key(key_description.buf, key);
-       printbuf_exit(&key_description);
-
-#ifndef __KERNEL__
-       if (ret) {
-               char *passphrase = read_passphrase("Enter passphrase: ");
-               struct bch_encrypted_key sb_key;
-
-               bch2_passphrase_check(sb, passphrase,
-                                     key, &sb_key);
-               ret = 0;
-       }
-#endif
-
-       /* stash with memfd, pass memfd fd to mount */
-
-       return ret;
-}
-
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *sb)
-{
-       key_serial_t key_id;
-       struct printbuf key_description = PRINTBUF;
-
-       prt_printf(&key_description, "bcachefs:");
-       pr_uuid(&key_description, sb->user_uuid.b);
-
-       key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
-       printbuf_exit(&key_description);
-       if (key_id < 0)
-               return errno;
-
-       keyctl_revoke(key_id);
-
-       return 0;
-}
-#endif
-
-int bch2_decrypt_sb_key(struct bch_fs *c,
-                       struct bch_sb_field_crypt *crypt,
-                       struct bch_key *key)
-{
-       struct bch_encrypted_key sb_key = crypt->key;
-       struct bch_key user_key;
-       int ret = 0;
-
-       /* is key encrypted? */
-       if (!bch2_key_is_encrypted(&sb_key))
-               goto out;
-
-       ret = bch2_request_key(c->disk_sb.sb, &user_key);
-       if (ret) {
-               bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
-               goto err;
-       }
-
-       /* decrypt real key: */
-       bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
-
-       if (bch2_key_is_encrypted(&sb_key)) {
-               bch_err(c, "incorrect encryption key");
-               ret = -EINVAL;
-               goto err;
-       }
-out:
-       *key = sb_key.key;
-err:
-       memzero_explicit(&sb_key, sizeof(sb_key));
-       memzero_explicit(&user_key, sizeof(user_key));
-       return ret;
-}
-
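Note the verification trick above: the superblock never stores the user's key, only the filesystem key encrypted with the passphrase-derived key, and a known magic value in the same struct doubles as the verifier — if decrypting with the supplied key does not bring the magic back (bch2_key_is_encrypted() still true), the key was wrong. A toy sketch of the pattern (XOR stands in for ChaCha20, and the magic constant here is arbitrary, not bcachefs's BCH_KEY_MAGIC):

    #include <assert.h>
    #include <stdint.h>

    struct enc_key { uint64_t magic; uint64_t key; };

    static void toy_crypt(struct enc_key *k, uint64_t pad)
    {
            k->magic ^= pad;        /* stand-in for a stream cipher */
            k->key   ^= pad;
    }

    int main(void)
    {
            const uint64_t MAGIC = 0x746f796b6579ULL;    /* arbitrary */
            struct enc_key k = { .magic = MAGIC, .key = 0x1234 };

            toy_crypt(&k, 0xdeadbeefcafeULL);    /* encrypt */
            assert(k.magic != MAGIC);            /* looks encrypted */

            toy_crypt(&k, 0xdeadbeefcafeULL);    /* decrypt, right key */
            assert(k.magic == MAGIC);            /* magic round-trips */
            return 0;
    }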
-#if 0
-
-/*
- * This seems to be duplicating code in cmd_remove_passphrase() in
- * bcachefs-tools, but we might want to switch userspace to use this - and
- * perhaps add an ioctl for calling this at runtime, so we can take the
- * passphrase off of a mounted filesystem (which has come up).
- */
-int bch2_disable_encryption(struct bch_fs *c)
-{
-       struct bch_sb_field_crypt *crypt;
-       struct bch_key key;
-       int ret = -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-
-       crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
-       if (!crypt)
-               goto out;
-
-       /* is key encrypted? */
-       ret = 0;
-       if (bch2_key_is_encrypted(&crypt->key))
-               goto out;
-
-       ret = bch2_decrypt_sb_key(c, crypt, &key);
-       if (ret)
-               goto out;
-
-       crypt->key.magic        = cpu_to_le64(BCH_KEY_MAGIC);
-       crypt->key.key          = key;
-
-       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
-       bch2_write_super(c);
-out:
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-/*
- * For enabling encryption on an existing filesystem: not hooked up yet, but it
- * should be
- */
-int bch2_enable_encryption(struct bch_fs *c, bool keyed)
-{
-       struct bch_encrypted_key key;
-       struct bch_key user_key;
-       struct bch_sb_field_crypt *crypt;
-       int ret = -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-
-       /* Do we already have an encryption key? */
-       if (bch2_sb_field_get(c->disk_sb.sb, crypt))
-               goto err;
-
-       ret = bch2_alloc_ciphers(c);
-       if (ret)
-               goto err;
-
-       key.magic = cpu_to_le64(BCH_KEY_MAGIC);
-       get_random_bytes(&key.key, sizeof(key.key));
-
-       if (keyed) {
-               ret = bch2_request_key(c->disk_sb.sb, &user_key);
-               if (ret) {
-                       bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
-                       goto err;
-               }
-
-               ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-                                             &key, sizeof(key));
-               if (ret)
-                       goto err;
-       }
-
-       ret = crypto_skcipher_setkey(&c->chacha20->base,
-                       (void *) &key.key, sizeof(key.key));
-       if (ret)
-               goto err;
-
-       crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
-                                    sizeof(*crypt) / sizeof(u64));
-       if (!crypt) {
-               ret = bch_err_throw(c, ENOSPC_sb_crypt);
-               goto err;
-       }
-
-       crypt->key = key;
-
-       /* write superblock */
-       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
-       bch2_write_super(c);
-err:
-       mutex_unlock(&c->sb_lock);
-       memzero_explicit(&user_key, sizeof(user_key));
-       memzero_explicit(&key, sizeof(key));
-       return ret;
-}
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *c)
-{
-       memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
-}
-
-int bch2_fs_encryption_init(struct bch_fs *c)
-{
-       struct bch_sb_field_crypt *crypt;
-       int ret;
-
-       crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
-       if (!crypt)
-               return 0;
-
-       ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
-       if (ret)
-               return ret;
-       c->chacha20_key_set = true;
-       return 0;
-}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
deleted file mode 100644
index 7bd9cf6..0000000
--- a/fs/bcachefs/checksum.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHECKSUM_H
-#define _BCACHEFS_CHECKSUM_H
-
-#include "bcachefs.h"
-#include "extents_types.h"
-#include "super-io.h"
-
-#include <linux/crc64.h>
-#include <crypto/chacha.h>
-
-static inline bool bch2_checksum_mergeable(unsigned type)
-{
-       switch (type) {
-       case BCH_CSUM_none:
-       case BCH_CSUM_crc32c:
-       case BCH_CSUM_crc64:
-               return true;
-       default:
-               return false;
-       }
-}
-
-struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
-                                   struct bch_csum, size_t);
-
-#define BCH_NONCE_EXTENT       cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE                cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL      cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO         cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY         cpu_to_le32(1 << 31)
-
-struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
-                            const void *, size_t);
-
-/*
- * This is used for various on disk data structures - bch_sb, prio_set, bset,
- * jset: The checksum is _always_ the first field of these structs
- */
-#define csum_vstruct(_c, _type, _nonce, _i)                            \
-({                                                                     \
-       const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
-                                                                       \
-       bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
-})
-
-static inline void bch2_csum_to_text(struct printbuf *out,
-                                    enum bch_csum_type type,
-                                    struct bch_csum csum)
-{
-       const u8 *p = (u8 *) &csum;
-       unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
-
-       for (unsigned i = 0; i < bytes; i++)
-               prt_hex_byte(out, p[i]);
-}
-
-static inline void bch2_csum_err_msg(struct printbuf *out,
-                                    enum bch_csum_type type,
-                                    struct bch_csum expected,
-                                    struct bch_csum got)
-{
-       prt_str(out, "checksum error, type ");
-       bch2_prt_csum_type(out, type);
-       prt_str(out, ": got ");
-       bch2_csum_to_text(out, type, got);
-       prt_str(out, " should be ");
-       bch2_csum_to_text(out, type, expected);
-}
-
-void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
-
-int bch2_request_key(struct bch_sb *, struct bch_key *);
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *);
-#endif
-
-int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
-                void *data, size_t);
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
-                                 struct nonce, struct bio *);
-
-int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
-                       struct bch_extent_crc_unpacked,
-                       struct bch_extent_crc_unpacked *,
-                       struct bch_extent_crc_unpacked *,
-                       unsigned, unsigned, unsigned);
-
-int __bch2_encrypt_bio(struct bch_fs *, unsigned,
-                      struct nonce, struct bio *);
-
-static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
-                                  struct nonce nonce, struct bio *bio)
-{
-       return bch2_csum_type_is_encryption(type)
-               ? __bch2_encrypt_bio(c, type, nonce, bio)
-               : 0;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
-
-int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
-                       struct bch_key *);
-
-#if 0
-int bch2_disable_encryption(struct bch_fs *);
-int bch2_enable_encryption(struct bch_fs *, bool);
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *);
-int bch2_fs_encryption_init(struct bch_fs *);
-
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
-                                                      bool data)
-{
-       switch (type) {
-       case BCH_CSUM_OPT_none:
-               return BCH_CSUM_none;
-       case BCH_CSUM_OPT_crc32c:
-               return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
-       case BCH_CSUM_OPT_crc64:
-               return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
-       case BCH_CSUM_OPT_xxhash:
-               return BCH_CSUM_xxhash;
-       default:
-               BUG();
-       }
-}
-
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
-                                                        struct bch_io_opts opts)
-{
-       if (opts.nocow)
-               return 0;
-
-       if (c->sb.encryption_type)
-               return c->opts.wide_macs
-                       ? BCH_CSUM_chacha20_poly1305_128
-                       : BCH_CSUM_chacha20_poly1305_80;
-
-       return bch2_csum_opt_to_type(opts.data_checksum, true);
-}
-
-static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
-{
-       if (c->sb.encryption_type)
-               return BCH_CSUM_chacha20_poly1305_128;
-
-       return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
-}
-
-static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
-                                          unsigned type)
-{
-       if (type >= BCH_CSUM_NR)
-               return false;
-
-       if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
-               return false;
-
-       return true;
-}
-
-/* returns true if not equal */
-static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
-       /*
-        * XXX: need some way of preventing the compiler from optimizing this
-        * into a form that isn't constant time..
-        */
-       return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
-
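The XXX above is a real concern when the checksum is a Poly1305 tag: a comparison that exits early on the first differing byte leaks tag bytes through timing. The usual fix is to accumulate all differences with no data-dependent branches — in-kernel, crypto_memneq() does exactly this. A userspace sketch of the idea (subject to the same "compiler might optimize it away" caveat the comment raises):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* constant-time inequality: OR together all byte differences,
     * no early exit -- the same idea as the kernel's crypto_memneq() */
    static int ct_memneq(const void *a, const void *b, size_t n)
    {
            const uint8_t *pa = a, *pb = b;
            uint8_t acc = 0;

            for (size_t i = 0; i < n; i++)
                    acc |= pa[i] ^ pb[i];
            return acc != 0;
    }

    int main(void)
    {
            uint64_t l[2] = { 1, 2 }, r[2] = { 1, 2 };

            assert(!ct_memneq(l, r, sizeof(l)));
            r[1] ^= 1;
            assert(ct_memneq(l, r, sizeof(l)));
            return 0;
    }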
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
-       EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
-
-       le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
-       return nonce;
-}
-
-static inline struct nonce null_nonce(void)
-{
-       struct nonce ret;
-
-       memset(&ret, 0, sizeof(ret));
-       return ret;
-}
-
-static inline struct nonce extent_nonce(struct bversion version,
-                                       struct bch_extent_crc_unpacked crc)
-{
-       unsigned compression_type = crc_is_compressed(crc)
-               ? crc.compression_type
-               : 0;
-       unsigned size = compression_type ? crc.uncompressed_size : 0;
-       struct nonce nonce = (struct nonce) {{
-               [0] = cpu_to_le32(size << 22),
-               [1] = cpu_to_le32(version.lo),
-               [2] = cpu_to_le32(version.lo >> 32),
-               [3] = cpu_to_le32(version.hi |
-                                 (compression_type << 24)) ^ BCH_NONCE_EXTENT,
-       }};
-
-       return nonce_add(nonce, crc.nonce << 9);
-}
-
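Worked example of the offset math: CHACHA_BLOCK_SIZE is 64 bytes, so one 512-byte sector is 8 ChaCha blocks. An extent with crc.nonce == 3 gets nonce_add(nonce, 3 << 9), i.e. le32_add_cpu(&nonce.d[0], 1536 / 64) — the keystream starts 24 blocks in, exactly where sector 3 of the original encrypted extent began.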
-static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
-{
-       return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
-{
-       __le64 magic = __bch2_sb_magic(sb);
-
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = 0,
-               [2] = ((__le32 *) &magic)[0],
-               [3] = ((__le32 *) &magic)[1],
-       }};
-}
-
-static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
-{
-       __le64 magic = bch2_sb_magic(c);
-
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = 0,
-               [2] = ((__le32 *) &magic)[0],
-               [3] = ((__le32 *) &magic)[1],
-       }};
-}
-
-#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
deleted file mode 100644
index 8e9264b..0000000
--- a/fs/bcachefs/clock.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-
-static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
-{
-       struct io_timer **_l = (struct io_timer **)l;
-       struct io_timer **_r = (struct io_timer **)r;
-
-       return (*_l)->expire < (*_r)->expire;
-}
-
-static const struct min_heap_callbacks callbacks = {
-       .less = io_timer_cmp,
-       .swp = NULL,
-};
-
-void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
-       spin_lock(&clock->timer_lock);
-
-       if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
-               spin_unlock(&clock->timer_lock);
-               timer->fn(timer);
-               return;
-       }
-
-       for (size_t i = 0; i < clock->timers.nr; i++)
-               if (clock->timers.data[i] == timer)
-                       goto out;
-
-       BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
-out:
-       spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
-       spin_lock(&clock->timer_lock);
-
-       for (size_t i = 0; i < clock->timers.nr; i++)
-               if (clock->timers.data[i] == timer) {
-                       min_heap_del(&clock->timers, i, &callbacks, NULL);
-                       break;
-               }
-
-       spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
-       struct io_timer         io_timer;
-       struct task_struct      *task;
-       int                     expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
-       struct io_clock_wait *wait = container_of(timer,
-                               struct io_clock_wait, io_timer);
-
-       wait->expired = 1;
-       wake_up_process(wait->task);
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
-{
-       struct io_clock_wait wait = {
-               .io_timer.expire        = until,
-               .io_timer.fn            = io_clock_wait_fn,
-               .io_timer.fn2           = (void *) _RET_IP_,
-               .task                   = current,
-       };
-
-       bch2_io_timer_add(clock, &wait.io_timer);
-       schedule();
-       bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
-                                    u64 io_until, unsigned long cpu_timeout)
-{
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
-       struct io_clock_wait wait = {
-               .io_timer.expire        = io_until,
-               .io_timer.fn            = io_clock_wait_fn,
-               .io_timer.fn2           = (void *) _RET_IP_,
-               .task                   = current,
-       };
-
-       bch2_io_timer_add(clock, &wait.io_timer);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       if (!(kthread && kthread_should_stop())) {
-               cpu_timeout = schedule_timeout(cpu_timeout);
-               try_to_freeze();
-       }
-
-       __set_current_state(TASK_RUNNING);
-       bch2_io_timer_del(clock, &wait.io_timer);
-       return cpu_timeout;
-}
-
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
-                               u64 io_until, unsigned long cpu_timeout)
-{
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
-
-       while (!(kthread && kthread_should_stop()) &&
-              cpu_timeout &&
-              atomic64_read(&clock->now) < io_until)
-               cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
-}
-
-static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
-{
-       struct io_timer *ret = NULL;
-
-       if (clock->timers.nr &&
-           time_after_eq64(now, clock->timers.data[0]->expire)) {
-               ret = *min_heap_peek(&clock->timers);
-               min_heap_pop(&clock->timers, &callbacks, NULL);
-       }
-
-       return ret;
-}
-
-void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
-{
-       struct io_timer *timer;
-       u64 now = atomic64_add_return(sectors, &clock->now);
-
-       spin_lock(&clock->timer_lock);
-       while ((timer = get_expired_timer(clock, now)))
-               timer->fn(timer);
-       spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
-{
-       out->atomic++;
-       spin_lock(&clock->timer_lock);
-       u64 now = atomic64_read(&clock->now);
-
-       printbuf_tabstop_push(out, 40);
-       prt_printf(out, "current time:\t%llu\n", now);
-
-       for (unsigned i = 0; i < clock->timers.nr; i++)
-               prt_printf(out, "%ps %ps:\t%llu\n",
-                      clock->timers.data[i]->fn,
-                      clock->timers.data[i]->fn2,
-                      clock->timers.data[i]->expire);
-       spin_unlock(&clock->timer_lock);
-       --out->atomic;
-}
-
-void bch2_io_clock_exit(struct io_clock *clock)
-{
-       free_heap(&clock->timers);
-       free_percpu(clock->pcpu_buf);
-}
-
-int bch2_io_clock_init(struct io_clock *clock)
-{
-       atomic64_set(&clock->now, 0);
-       spin_lock_init(&clock->timer_lock);
-
-       clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
-
-       clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
-       if (!clock->pcpu_buf)
-               return -BCH_ERR_ENOMEM_io_clock_init;
-
-       if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
-               return -BCH_ERR_ENOMEM_io_clock_init;
-
-       return 0;
-}
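The unusual part of this clock is its unit: now advances by sectors of IO rather than wall time, so work scheduled against it (copygc, rebalance throttling) automatically speeds up on a busy device and stops on an idle one. A toy single-threaded sketch of the idea:

    #include <stdint.h>
    #include <stdio.h>

    /* "IO clock": timers fire after an amount of IO, not an amount of
     * time -- a toy, single-threaded version of the scheme above */
    struct io_timer { uint64_t expire; void (*fn)(void); };

    static void fire(void) { puts("fired after 1024 sectors of IO"); }

    int main(void)
    {
            uint64_t now = 0;
            struct io_timer t = { .expire = 1024, .fn = fire };

            for (int i = 0; i < 16; i++) {
                    now += 128;                     /* one 64 KiB write */
                    if (t.fn && now >= t.expire) {
                            t.fn();
                            t.fn = NULL;
                    }
            }
            return 0;
    }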
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
deleted file mode 100644
index 8769be2..0000000
--- a/fs/bcachefs/clock.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_H
-#define _BCACHEFS_CLOCK_H
-
-void bch2_io_timer_add(struct io_clock *, struct io_timer *);
-void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
-void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-
-void __bch2_increment_clock(struct io_clock *, u64);
-
-static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
-                                       int rw)
-{
-       struct io_clock *clock = &c->io_clock[rw];
-
-       if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
-                  IO_CLOCK_PCPU_SECTORS))
-               __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
-}
-
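The batching math: each CPU absorbs up to IO_CLOCK_PCPU_SECTORS (128) sectors locally before folding into the shared atomic, so clock->now can lag the true total by at most max_slop = 128 * num_possible_cpus() sectors — on a 16-CPU machine, 2048 sectors or 1 MiB of IO.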
-void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-
-void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
-
-void bch2_io_clock_exit(struct io_clock *);
-int bch2_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
deleted file mode 100644
index 37554e4..0000000
--- a/fs/bcachefs/clock_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_TYPES_H
-#define _BCACHEFS_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS           (BCH_SB_MEMBERS_MAX * 3)
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
-       io_timer_fn             fn;
-       void                    *fn2;
-       u64                     expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS  128
-
-typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap)      io_timer_heap;
-
-struct io_clock {
-       atomic64_t              now;
-       u16 __percpu            *pcpu_buf;
-       unsigned                max_slop;
-
-       spinlock_t              timer_lock;
-       io_timer_heap           timers;
-};
-
-#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
deleted file mode 100644
index b37b1f3..0000000
--- a/fs/bcachefs/compress.c
+++ /dev/null
@@ -1,773 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "compress.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "opts.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-#include <linux/zstd.h>
-
-static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
-{
-       switch (type) {
-       case BCH_COMPRESSION_TYPE_none:
-       case BCH_COMPRESSION_TYPE_incompressible:
-               return BCH_COMPRESSION_OPT_none;
-       case BCH_COMPRESSION_TYPE_lz4_old:
-       case BCH_COMPRESSION_TYPE_lz4:
-               return BCH_COMPRESSION_OPT_lz4;
-       case BCH_COMPRESSION_TYPE_gzip:
-               return BCH_COMPRESSION_OPT_gzip;
-       case BCH_COMPRESSION_TYPE_zstd:
-               return BCH_COMPRESSION_OPT_zstd;
-       default:
-               BUG();
-       }
-}
-
-/* Bounce buffer: */
-struct bbuf {
-       void            *b;
-       enum {
-               BB_NONE,
-               BB_VMAP,
-               BB_KMALLOC,
-               BB_MEMPOOL,
-       }               type;
-       int             rw;
-};
-
-static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
-{
-       void *b;
-
-       BUG_ON(size > c->opts.encoded_extent_max);
-
-       b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
-       if (b)
-               return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
-
-       b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
-       if (b)
-               return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
-       BUG();
-}
-
-static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
-{
-       struct bio_vec bv;
-       struct bvec_iter iter;
-       void *expected_start = NULL;
-
-       __bio_for_each_bvec(bv, bio, iter, start) {
-               if (expected_start &&
-                   expected_start != page_address(bv.bv_page) + bv.bv_offset)
-                       return false;
-
-               expected_start = page_address(bv.bv_page) +
-                       bv.bv_offset + bv.bv_len;
-       }
-
-       return true;
-}
-
-static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
-                                      struct bvec_iter start, int rw)
-{
-       struct bbuf ret;
-       struct bio_vec bv;
-       struct bvec_iter iter;
-       unsigned nr_pages = 0;
-       struct page *stack_pages[16];
-       struct page **pages = NULL;
-       void *data;
-
-       BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-
-       if (!PageHighMem(bio_iter_page(bio, start)) &&
-           bio_phys_contig(bio, start))
-               return (struct bbuf) {
-                       .b = page_address(bio_iter_page(bio, start)) +
-                               bio_iter_offset(bio, start),
-                       .type = BB_NONE, .rw = rw
-               };
-
-       /* check if we can map the pages contiguously: */
-       __bio_for_each_segment(bv, bio, iter, start) {
-               if (iter.bi_size != start.bi_size &&
-                   bv.bv_offset)
-                       goto bounce;
-
-               if (bv.bv_len < iter.bi_size &&
-                   bv.bv_offset + bv.bv_len < PAGE_SIZE)
-                       goto bounce;
-
-               nr_pages++;
-       }
-
-       BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
-       pages = nr_pages > ARRAY_SIZE(stack_pages)
-               ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
-               : stack_pages;
-       if (!pages)
-               goto bounce;
-
-       nr_pages = 0;
-       __bio_for_each_segment(bv, bio, iter, start)
-               pages[nr_pages++] = bv.bv_page;
-
-       data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-       if (pages != stack_pages)
-               kfree(pages);
-
-       if (data)
-               return (struct bbuf) {
-                       .b = data + bio_iter_offset(bio, start),
-                       .type = BB_VMAP, .rw = rw
-               };
-bounce:
-       ret = __bounce_alloc(c, start.bi_size, rw);
-
-       if (rw == READ)
-               memcpy_from_bio(ret.b, bio, start);
-
-       return ret;
-}
-
-static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
-{
-       return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
-{
-       switch (buf.type) {
-       case BB_NONE:
-               break;
-       case BB_VMAP:
-               vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
-               break;
-       case BB_KMALLOC:
-               kfree(buf.b);
-               break;
-       case BB_MEMPOOL:
-               mempool_free(buf.b, &c->compression_bounce[buf.rw]);
-               break;
-       }
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
-#ifdef __KERNEL__
-       strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
-                           void *dst_data, struct bch_extent_crc_unpacked crc)
-{
-       struct bbuf src_data = { NULL };
-       size_t src_len = src->bi_iter.bi_size;
-       size_t dst_len = crc.uncompressed_size << 9;
-       void *workspace;
-       int ret = 0, ret2;
-
-       enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
-       mempool_t *workspace_pool = &c->compress_workspace[opt];
-       if (unlikely(!mempool_initialized(workspace_pool))) {
-               if (fsck_err(c, compression_type_not_marked_in_sb,
-                            "compression type %s set but not marked in superblock",
-                            __bch2_compression_types[crc.compression_type]))
-                       ret = bch2_check_set_has_compressed_data(c, opt);
-               else
-                       ret = bch_err_throw(c, compression_workspace_not_initialized);
-               if (ret)
-                       goto err;
-       }
-
-       src_data = bio_map_or_bounce(c, src, READ);
-
-       switch (crc.compression_type) {
-       case BCH_COMPRESSION_TYPE_lz4_old:
-       case BCH_COMPRESSION_TYPE_lz4:
-               ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
-                                                  src_len, dst_len, dst_len);
-               if (ret2 != dst_len)
-                       ret = bch_err_throw(c, decompress_lz4);
-               break;
-       case BCH_COMPRESSION_TYPE_gzip: {
-               z_stream strm = {
-                       .next_in        = src_data.b,
-                       .avail_in       = src_len,
-                       .next_out       = dst_data,
-                       .avail_out      = dst_len,
-               };
-
-               workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
-               zlib_set_workspace(&strm, workspace);
-               zlib_inflateInit2(&strm, -MAX_WBITS);
-               ret2 = zlib_inflate(&strm, Z_FINISH);
-
-               mempool_free(workspace, workspace_pool);
-
-               if (ret2 != Z_STREAM_END)
-                       ret = bch_err_throw(c, decompress_gzip);
-               break;
-       }
-       case BCH_COMPRESSION_TYPE_zstd: {
-               ZSTD_DCtx *ctx;
-               size_t real_src_len = le32_to_cpup(src_data.b);
-
-               if (real_src_len > src_len - 4) {
-                       ret = bch_err_throw(c, decompress_zstd_src_len_bad);
-                       goto err;
-               }
-
-               workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-               ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
-
-               ret2 = zstd_decompress_dctx(ctx,
-                               dst_data,       dst_len,
-                               src_data.b + 4, real_src_len);
-
-               mempool_free(workspace, workspace_pool);
-
-               if (ret2 != dst_len)
-                       ret = bch_err_throw(c, decompress_zstd);
-               break;
-       }
-       default:
-               BUG();
-       }
-err:
-fsck_err:
-       bio_unmap_or_unbounce(c, src_data);
-       return ret;
-}
-
-int bch2_bio_uncompress_inplace(struct bch_write_op *op,
-                               struct bio *bio)
-{
-       struct bch_fs *c = op->c;
-       struct bch_extent_crc_unpacked *crc = &op->crc;
-       struct bbuf data = { NULL };
-       size_t dst_len = crc->uncompressed_size << 9;
-       int ret = 0;
-
-       /* bio must own its pages: */
-       BUG_ON(!bio->bi_vcnt);
-       BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
-
-       if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
-               bch2_write_op_error(op, op->pos.offset,
-                                   "extent too big to decompress (%u > %u)",
-                                   crc->uncompressed_size << 9, c->opts.encoded_extent_max);
-               return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
-       }
-
-       data = __bounce_alloc(c, dst_len, WRITE);
-
-       ret = __bio_uncompress(c, bio, data.b, *crc);
-
-       if (c->opts.no_data_io)
-               ret = 0;
-
-       if (ret) {
-               bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
-               goto err;
-       }
-
-       /*
-        * XXX: don't have a good way to assert that the bio was allocated with
-        * enough space, we depend on bch2_move_extent doing the right thing
-        */
-       bio->bi_iter.bi_size = crc->live_size << 9;
-
-       memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
-
-       crc->csum_type          = 0;
-       crc->compression_type   = 0;
-       crc->compressed_size    = crc->live_size;
-       crc->uncompressed_size  = crc->live_size;
-       crc->offset             = 0;
-       crc->csum               = (struct bch_csum) { 0, 0 };
-err:
-       bio_unmap_or_unbounce(c, data);
-       return ret;
-}
-
-int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
-                      struct bio *dst, struct bvec_iter dst_iter,
-                      struct bch_extent_crc_unpacked crc)
-{
-       struct bbuf dst_data = { NULL };
-       size_t dst_len = crc.uncompressed_size << 9;
-       int ret;
-
-       if (crc.uncompressed_size << 9  > c->opts.encoded_extent_max ||
-           crc.compressed_size << 9    > c->opts.encoded_extent_max)
-               return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
-
-       dst_data = dst_len == dst_iter.bi_size
-               ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
-               : __bounce_alloc(c, dst_len, WRITE);
-
-       ret = __bio_uncompress(c, src, dst_data.b, crc);
-       if (ret)
-               goto err;
-
-       if (dst_data.type != BB_NONE &&
-           dst_data.type != BB_VMAP)
-               memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
-err:
-       bio_unmap_or_unbounce(c, dst_data);
-       return ret;
-}
-
-static int attempt_compress(struct bch_fs *c,
-                           void *workspace,
-                           void *dst, size_t dst_len,
-                           void *src, size_t src_len,
-                           struct bch_compression_opt compression)
-{
-       enum bch_compression_type compression_type =
-               __bch2_compression_opt_to_type[compression.type];
-
-       switch (compression_type) {
-       case BCH_COMPRESSION_TYPE_lz4:
-               if (compression.level < LZ4HC_MIN_CLEVEL) {
-                       int len = src_len;
-                       int ret = LZ4_compress_destSize(
-                                       src,            dst,
-                                       &len,           dst_len,
-                                       workspace);
-                       if (len < src_len)
-                               return -len;
-
-                       return ret;
-               } else {
-                       int ret = LZ4_compress_HC(
-                                       src,            dst,
-                                       src_len,        dst_len,
-                                       compression.level,
-                                       workspace);
-
-                       return ret ?: -1;
-               }
-       case BCH_COMPRESSION_TYPE_gzip: {
-               z_stream strm = {
-                       .next_in        = src,
-                       .avail_in       = src_len,
-                       .next_out       = dst,
-                       .avail_out      = dst_len,
-               };
-
-               zlib_set_workspace(&strm, workspace);
-               if (zlib_deflateInit2(&strm,
-                                 compression.level
-                                 ? clamp_t(unsigned, compression.level,
-                                           Z_BEST_SPEED, Z_BEST_COMPRESSION)
-                                 : Z_DEFAULT_COMPRESSION,
-                                 Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
-                                 Z_DEFAULT_STRATEGY) != Z_OK)
-                       return 0;
-
-               if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
-                       return 0;
-
-               if (zlib_deflateEnd(&strm) != Z_OK)
-                       return 0;
-
-               return strm.total_out;
-       }
-       case BCH_COMPRESSION_TYPE_zstd: {
-               /*
-                * rescale:
-                * zstd max compression level is 22, our max level is 15
-                */
-               unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
-               ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
-               ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
-
-               /*
-                * ZSTD requires that when we decompress we pass in the exact
-                * compressed size - rounding it up to the nearest sector
-                * doesn't work, so we use the first 4 bytes of the buffer for
-                * that.
-                *
-                * Additionally, the ZSTD code seems to have a bug where it will
-                * write just past the end of the buffer - so subtract a fudge
-                * factor (7 bytes) from the dst buffer size to account for
-                * that.
-                */
-               size_t len = zstd_compress_cctx(ctx,
-                               dst + 4,        dst_len - 4 - 7,
-                               src,            src_len,
-                               &params);
-               if (zstd_is_error(len))
-                       return 0;
-
-               *((__le32 *) dst) = cpu_to_le32(len);
-               return len + 4;
-       }
-       default:
-               BUG();
-       }
-}
-
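The 4-byte prefix matters because, unlike LZ4_decompress_safe_partial() above, one-shot zstd decompression must be handed the exact compressed length — a size rounded up to the sector doesn't work, hence the length stashed (little-endian) in the first four bytes of the encoded extent. A userspace sketch of the same framing against libzstd (an assumption for illustration; the kernel uses the in-tree zstd wrappers):

    /* build with: cc zstd_frame.c -lzstd */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include <zstd.h>

    int main(void)
    {
            char src[4096], rt[4096], dst[4 + 8192];
            memset(src, 'x', sizeof(src));          /* compressible input */

            size_t clen = ZSTD_compress(dst + 4, sizeof(dst) - 4,
                                        src, sizeof(src), 3);
            assert(!ZSTD_isError(clen));

            uint32_t prefix = (uint32_t)clen;       /* kernel code uses
                                                       cpu_to_le32() here */
            memcpy(dst, &prefix, 4);

            /* reader: recover the exact compressed size from the prefix */
            memcpy(&prefix, dst, 4);
            size_t dlen = ZSTD_decompress(rt, sizeof(rt), dst + 4, prefix);
            assert(!ZSTD_isError(dlen) && dlen == sizeof(src));
            assert(!memcmp(src, rt, sizeof(src)));
            return 0;
    }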
-static unsigned __bio_compress(struct bch_fs *c,
-                              struct bio *dst, size_t *dst_len,
-                              struct bio *src, size_t *src_len,
-                              struct bch_compression_opt compression)
-{
-       struct bbuf src_data = { NULL }, dst_data = { NULL };
-       void *workspace;
-       enum bch_compression_type compression_type =
-               __bch2_compression_opt_to_type[compression.type];
-       unsigned pad;
-       int ret = 0;
-
-       /* bch2_compression_decode catches unknown compression types: */
-       BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
-
-       mempool_t *workspace_pool = &c->compress_workspace[compression.type];
-       if (unlikely(!mempool_initialized(workspace_pool))) {
-               if (fsck_err(c, compression_opt_not_marked_in_sb,
-                            "compression opt %s set but not marked in superblock",
-                            bch2_compression_opts[compression.type])) {
-                       ret = bch2_check_set_has_compressed_data(c, compression.type);
-                       if (ret) /* memory allocation failure, don't compress */
-                               return 0;
-               } else {
-                       return 0;
-               }
-       }
-
-       /* If it's only one block, don't bother trying to compress: */
-       if (src->bi_iter.bi_size <= c->opts.block_size)
-               return BCH_COMPRESSION_TYPE_incompressible;
-
-       dst_data = bio_map_or_bounce(c, dst, WRITE);
-       src_data = bio_map_or_bounce(c, src, READ);
-
-       workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
-       *src_len = src->bi_iter.bi_size;
-       *dst_len = dst->bi_iter.bi_size;
-
-       /*
-        * XXX: this algorithm sucks when the compression code doesn't tell us
-        * how much would fit, like LZ4 does:
-        */
-       while (1) {
-               if (*src_len <= block_bytes(c)) {
-                       ret = -1;
-                       break;
-               }
-
-               ret = attempt_compress(c, workspace,
-                                      dst_data.b,      *dst_len,
-                                      src_data.b,      *src_len,
-                                      compression);
-               if (ret > 0) {
-                       *dst_len = ret;
-                       ret = 0;
-                       break;
-               }
-
-               /* Didn't fit: should we retry with a smaller amount?  */
-               if (*src_len <= *dst_len) {
-                       ret = -1;
-                       break;
-               }
-
-               /*
-                * If ret is negative, it's a hint as to how much data would fit
-                */
-               BUG_ON(-ret >= *src_len);
-
-               if (ret < 0)
-                       *src_len = -ret;
-               else
-                       *src_len -= (*src_len - *dst_len) / 2;
-               *src_len = round_down(*src_len, block_bytes(c));
-       }
-
-       mempool_free(workspace, workspace_pool);
-
-       if (ret)
-               goto err;
-
-       /* Didn't get smaller: */
-       if (round_up(*dst_len, block_bytes(c)) >= *src_len)
-               goto err;
-
-       pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
-       memset(dst_data.b + *dst_len, 0, pad);
-       *dst_len += pad;
-
-       if (dst_data.type != BB_NONE &&
-           dst_data.type != BB_VMAP)
-               memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
-
-       BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
-       BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
-       BUG_ON(*dst_len & (block_bytes(c) - 1));
-       BUG_ON(*src_len & (block_bytes(c) - 1));
-       ret = compression_type;
-out:
-       bio_unmap_or_unbounce(c, src_data);
-       bio_unmap_or_unbounce(c, dst_data);
-       return ret;
-err:
-       ret = BCH_COMPRESSION_TYPE_incompressible;
-       goto out;
-fsck_err:
-       ret = 0;
-       goto out;
-}
-
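Worked example of the shrink loop: say *src_len is 128 KiB and *dst_len is 64 KiB. LZ4_compress_destSize() reports how much input actually fit, so attempt_compress() might return -90 KiB, and the next pass compresses round_down(90 KiB, block_bytes(c)) and succeeds immediately. gzip and zstd give no such hint (attempt_compress() yields 0 on overflow), so the loop falls back to halving the overshoot — *src_len shrinks by (128 - 64) / 2 = 32 KiB — and may take several iterations before the input fits or the attempt is abandoned as incompressible.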
-unsigned bch2_bio_compress(struct bch_fs *c,
-                          struct bio *dst, size_t *dst_len,
-                          struct bio *src, size_t *src_len,
-                          unsigned compression_opt)
-{
-       unsigned orig_dst = dst->bi_iter.bi_size;
-       unsigned orig_src = src->bi_iter.bi_size;
-       unsigned compression_type;
-
-       /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
-       src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
-                                    c->opts.encoded_extent_max);
-       /* Don't generate a bigger output than input: */
-       dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
-       compression_type =
-               __bio_compress(c, dst, dst_len, src, src_len,
-                              bch2_compression_decode(compression_opt));
-
-       dst->bi_iter.bi_size = orig_dst;
-       src->bi_iter.bi_size = orig_src;
-       return compression_type;
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *, u64);
-
-#define BCH_FEATURE_none       0
-
-static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
-       BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-#undef BCH_FEATURE_none
-
-static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
-{
-       int ret = 0;
-
-       if ((c->sb.features & f) == f)
-               return 0;
-
-       mutex_lock(&c->sb_lock);
-
-       if ((c->sb.features & f) == f) {
-               mutex_unlock(&c->sb_lock);
-               return 0;
-       }
-
-       ret = __bch2_fs_compress_init(c, c->sb.features|f);
-       if (ret) {
-               mutex_unlock(&c->sb_lock);
-               return ret;
-       }
-
-       c->disk_sb.sb->features[0] |= cpu_to_le64(f);
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
-                                      unsigned compression_opt)
-{
-       unsigned compression_type = bch2_compression_decode(compression_opt).type;
-
-       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
-
-       return compression_type
-               ? __bch2_check_set_has_compressed_data(c,
-                               1ULL << bch2_compression_opt_to_feature[compression_type])
-               : 0;
-}
-
-void bch2_fs_compress_exit(struct bch_fs *c)
-{
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
-               mempool_exit(&c->compress_workspace[i]);
-       mempool_exit(&c->compression_bounce[WRITE]);
-       mempool_exit(&c->compression_bounce[READ]);
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
-{
-       ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
-                                                c->opts.encoded_extent_max);
-
-       c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
-
-       struct {
-               unsigned                        feature;
-               enum bch_compression_opts       type;
-               size_t                          compress_workspace;
-       } compression_types[] = {
-               { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
-                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
-               { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
-                       max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
-                           zlib_inflate_workspacesize()) },
-               { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
-                       max(c->zstd_workspace_size,
-                           zstd_dctx_workspace_bound()) },
-       }, *i;
-       bool have_compressed = false;
-
-       for (i = compression_types;
-            i < compression_types + ARRAY_SIZE(compression_types);
-            i++)
-               have_compressed |= (features & (1 << i->feature)) != 0;
-
-       if (!have_compressed)
-               return 0;
-
-       if (!mempool_initialized(&c->compression_bounce[READ]) &&
-           mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
-                                      1, c->opts.encoded_extent_max))
-               return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
-
-       if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
-           mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
-                                      1, c->opts.encoded_extent_max))
-               return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
-
-       for (i = compression_types;
-            i < compression_types + ARRAY_SIZE(compression_types);
-            i++) {
-               if (!(features & (1 << i->feature)))
-                       continue;
-
-               if (mempool_initialized(&c->compress_workspace[i->type]))
-                       continue;
-
-               if (mempool_init_kvmalloc_pool(
-                               &c->compress_workspace[i->type],
-                               1, i->compress_workspace))
-                       return bch_err_throw(c, ENOMEM_compression_workspace_init);
-       }
-
-       return 0;
-}
-
-static u64 compression_opt_to_feature(unsigned v)
-{
-       unsigned type = bch2_compression_decode(v).type;
-
-       return BIT_ULL(bch2_compression_opt_to_feature[type]);
-}
-
-int bch2_fs_compress_init(struct bch_fs *c)
-{
-       u64 f = c->sb.features;
-
-       f |= compression_opt_to_feature(c->opts.compression);
-       f |= compression_opt_to_feature(c->opts.background_compression);
-
-       return __bch2_fs_compress_init(c, f);
-}
-
-int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
-                              struct printbuf *err)
-{
-       char *val = kstrdup(_val, GFP_KERNEL);
-       char *p = val, *type_str, *level_str;
-       struct bch_compression_opt opt = { 0 };
-       int ret;
-
-       if (!val)
-               return -ENOMEM;
-
-       type_str = strsep(&p, ":");
-       level_str = p;
-
-       ret = match_string(bch2_compression_opts, -1, type_str);
-       if (ret < 0 && err)
-               prt_printf(err, "invalid compression type\n");
-       if (ret < 0)
-               goto err;
-
-       opt.type = ret;
-
-       if (level_str) {
-               unsigned level;
-
-               ret = kstrtouint(level_str, 10, &level);
-               if (!ret && !opt.type && level)
-                       ret = -EINVAL;
-               if (!ret && level > 15)
-                       ret = -EINVAL;
-               if (ret < 0 && err)
-                       prt_printf(err, "invalid compression level\n");
-               if (ret < 0)
-                       goto err;
-
-               opt.level = level;
-       }
-
-       *res = bch2_compression_encode(opt);
-err:
-       kfree(val);
-       return ret;
-}
-
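Accepted values, per the grammar above: a bare type ("none", "lz4", "gzip", "zstd") or type:level with level 1-15; a level paired with "none", or a level above 15, is rejected. A hypothetical invocation:

    bcachefs format --compression=zstd:7 /dev/sdb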
-void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
-{
-       struct bch_compression_opt opt = bch2_compression_decode(v);
-
-       if (opt.type < BCH_COMPRESSION_OPT_NR)
-               prt_str(out, bch2_compression_opts[opt.type]);
-       else
-               prt_printf(out, "(unknown compression opt %u)", opt.type);
-       if (opt.level)
-               prt_printf(out, ":%u", opt.level);
-}
-
-void bch2_opt_compression_to_text(struct printbuf *out,
-                                 struct bch_fs *c,
-                                 struct bch_sb *sb,
-                                 u64 v)
-{
-       return bch2_compression_opt_to_text(out, v);
-}
-
-int bch2_opt_compression_validate(u64 v, struct printbuf *err)
-{
-       if (!bch2_compression_opt_valid(v)) {
-               prt_printf(err, "invalid compression opt %llu", v);
-               return -BCH_ERR_invalid_sb_opt_compression;
-       }
-
-       return 0;
-}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
deleted file mode 100644
index bec2f05..0000000
--- a/fs/bcachefs/compress.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COMPRESS_H
-#define _BCACHEFS_COMPRESS_H
-
-#include "extents_types.h"
-
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
-       BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-struct bch_compression_opt {
-       u8              type:4,
-                       level:4;
-};
-
-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
-{
-       return (struct bch_compression_opt) {
-               .type   = v & 15,
-               .level  = v >> 4,
-       };
-}
-
-static inline bool bch2_compression_opt_valid(unsigned v)
-{
-       struct bch_compression_opt opt = __bch2_compression_decode(v);
-
-       return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
-}
-
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
-{
-       return bch2_compression_opt_valid(v)
-               ? __bch2_compression_decode(v)
-               : (struct bch_compression_opt) { 0 };
-}
-
-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
-{
-       return opt.type | (opt.level << 4);
-}
-
-static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
-{
-       return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
-}
-
-struct bch_write_op;
-int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
-int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
-                      struct bvec_iter, struct bch_extent_crc_unpacked);
-unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
-                          struct bio *, size_t *, unsigned);
-
-int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch2_fs_compress_exit(struct bch_fs *);
-int bch2_fs_compress_init(struct bch_fs *);
-
-void bch2_compression_opt_to_text(struct printbuf *, u64);
-
-int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-int bch2_opt_compression_validate(u64, struct printbuf *);
-
-#define bch2_opt_compression (struct bch_opt_fn) {             \
-       .parse          = bch2_opt_compression_parse,           \
-       .to_text        = bch2_opt_compression_to_text,         \
-       .validate       = bch2_opt_compression_validate,        \
-}
-
-#endif /* _BCACHEFS_COMPRESS_H */
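
One detail worth noting in bch2_compression_decode(): an out-of-range or inconsistent value decodes to { 0 } (no compression) rather than garbage, so a superblock field written by a newer version degrades safely. A minimal userspace round-trip showing that behaviour; the types and limits are copied from the header above, and OPT_NR = 4 is an assumed stand-in for ARRAY_SIZE(__bch2_compression_opt_to_type):

    #include <stdint.h>
    #include <stdio.h>

    #define OPT_NR 4    /* assumed: number of known compression options */

    struct copt { uint8_t type:4, level:4; };

    static struct copt decode_raw(unsigned v)
    {
        return (struct copt) { .type = v & 15, .level = v >> 4 };
    }

    static int valid(unsigned v)
    {
        struct copt o = decode_raw(v);
        return o.type < OPT_NR && !(!o.type && o.level);
    }

    /* Like bch2_compression_decode(): invalid values fall back to "none". */
    static struct copt decode(unsigned v)
    {
        return valid(v) ? decode_raw(v) : (struct copt) { 0 };
    }

    int main(void)
    {
        struct copt a = decode(3 | 7 << 4);   /* in range: type 3, level 7 */
        struct copt b = decode(9 | 1 << 4);   /* unknown type 9: falls back */

        printf("a: type=%u level=%u\n", a.type, a.level);
        printf("b: type=%u level=%u\n", b.type, b.level);
        return 0;
    }
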
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
deleted file mode 100644 (file)
index e86d36d..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include "darray.h"
-
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
-{
-       if (new_size > d->size) {
-               new_size = roundup_pow_of_two(new_size);
-
-               /*
-                * This is a workaround: kvmalloc() doesn't support > INT_MAX
-                * allocations, but vmalloc() does.
-                * The limit needs to be lifted from kvmalloc(), and once it is
-                * we'll go back to just using that.
-                */
-               size_t bytes;
-               if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
-                       return -ENOMEM;
-
-               void *data = likely(bytes < INT_MAX)
-                       ? kvmalloc_noprof(bytes, gfp)
-                       : vmalloc_noprof(bytes);
-               if (!data)
-                       return -ENOMEM;
-
-               if (d->size)
-                       memcpy(data, d->data, d->size * element_size);
-               if (d->data != d->preallocated)
-                       kvfree(d->data);
-               d->data = data;
-               d->size = new_size;
-       }
-
-       return 0;
-}
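
The resize above only ever grows the array: the requested size is rounded up to a power of two so repeated pushes amortize to O(1), the byte count is checked for multiplication overflow, and as the comment notes, allocations past INT_MAX go straight to vmalloc(). A userspace sketch of the same growth policy, with malloc standing in for kvmalloc, GCC/Clang's __builtin_mul_overflow standing in for check_mul_overflow(), and the preallocated-buffer handling omitted:

    #include <stdlib.h>
    #include <string.h>

    struct darray { void *data; size_t nr, size; };

    static size_t roundup_pow_of_two(size_t n)
    {
        size_t r = 1;
        while (r < n)
            r <<= 1;
        return r;
    }

    /* Grow d to hold at least new_size elements; returns 0 or -1 (ENOMEM-style). */
    static int darray_resize(struct darray *d, size_t element_size, size_t new_size)
    {
        if (new_size <= d->size)
            return 0;

        new_size = roundup_pow_of_two(new_size);

        size_t bytes;
        if (__builtin_mul_overflow(new_size, element_size, &bytes))
            return -1;                    /* element count * size overflowed */

        void *data = malloc(bytes);       /* kernel: kvmalloc(), vmalloc() past INT_MAX */
        if (!data)
            return -1;

        if (d->nr)
            memcpy(data, d->data, d->nr * element_size);
        free(d->data);
        d->data = data;
        d->size = new_size;
        return 0;
    }

    int main(void)
    {
        struct darray d = { 0 };
        for (int i = 0; i < 100; i++) {
            if (darray_resize(&d, sizeof(int), d.nr + 1))
                return 1;
            ((int *)d.data)[d.nr++] = i;  /* push */
        }
        free(d.data);
        return 0;
    }
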
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
deleted file mode 100644 (file)
index 4080ee9..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
-
-/*
- * Dynamic arrays:
- *
- * Inspired by CCAN's darray
- */
-
-#include <linux/cleanup.h>
-#include <linux/slab.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr)                                        \
-struct {                                                               \
-       size_t nr, size;                                                \
-       _type *data;                                                    \
-       _type preallocated[_nr];                                        \
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char)   darray_char;
-typedef DARRAY(char *) darray_str;
-typedef DARRAY(const char *) darray_const_str;
-
-typedef DARRAY(u8)     darray_u8;
-typedef DARRAY(u16)    darray_u16;
-typedef DARRAY(u32)    darray_u32;
-typedef DARRAY(u64)    darray_u64;
-
-typedef DARRAY(s8)     darray_s8;
-typedef DARRAY(s16)    darray_s16;
-typedef DARRAY(s32)    darray_s32;
-typedef DARRAY(s64)    darray_s64;
-
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
-
-#define __bch2_darray_resize(...)      alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-
-#define __darray_resize(_d, _element_size, _new_size, _gfp)            \
-       (unlikely((_new_size) > (_d)->size)                             \
-        ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
-        : 0)
-
-#define darray_resize_gfp(_d, _new_size, _gfp)                         \
-       __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
-
-#define darray_resize(_d, _new_size)                                   \
-       darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-
-#define darray_make_room_gfp(_d, _more, _gfp)                          \
-       darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
-
-#define darray_make_room(_d, _more)                                    \
-       darray_make_room_gfp(_d, _more, GFP_KERNEL)
-
-#define darray_room(_d)                ((_d).size - (_d).nr)
-
-#define darray_top(_d)         ((_d).data[(_d).nr])
-
-#define darray_push_gfp(_d, _item, _gfp)                               \
-({                                                                     \
-       int _ret = darray_make_room_gfp((_d), 1, _gfp);                 \
-                                                                       \
-       if (!_ret)                                                      \
-               (_d)->data[(_d)->nr++] = (_item);                       \
-       _ret;                                                           \
-})
-
-#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
-
-#define darray_pop(_d)         ((_d)->data[--(_d)->nr])
-
-#define darray_first(_d)       ((_d).data[0])
-#define darray_last(_d)                ((_d).data[(_d).nr - 1])
-
-#define darray_insert_item(_d, pos, _item)                             \
-({                                                                     \
-       size_t _pos = (pos);                                            \
-       int _ret = darray_make_room((_d), 1);                           \
-                                                                       \
-       if (!_ret)                                                      \
-               array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
-       _ret;                                                           \
-})
-
-#define darray_remove_item(_d, _pos)                                   \
-       array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-
-#define darray_find_p(_d, _i, cond)                                    \
-({                                                                     \
-       typeof((_d).data) _ret = NULL;                                  \
-                                                                       \
-       darray_for_each(_d, _i)                                         \
-               if (cond) {                                             \
-                       _ret = _i;                                      \
-                       break;                                          \
-               }                                                       \
-       _ret;                                                           \
-})
-
-#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
-
-/* Iteration: */
-
-#define __darray_for_each(_d, _i)                                      \
-       for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each(_d, _i)                                                \
-       for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each_reverse(_d, _i)                                        \
-       for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
-
-/* Init/exit */
-
-#define darray_init(_d)                                                        \
-do {                                                                   \
-       (_d)->nr = 0;                                                   \
-       (_d)->size = ARRAY_SIZE((_d)->preallocated);                    \
-       (_d)->data = (_d)->size ? (_d)->preallocated : NULL;            \
-} while (0)
-
-#define darray_exit(_d)                                                        \
-do {                                                                   \
-       if (!ARRAY_SIZE((_d)->preallocated) ||                          \
-           (_d)->data != (_d)->preallocated)                           \
-               kvfree((_d)->data);                                     \
-       darray_init(_d);                                                \
-} while (0)
-
-#define DEFINE_DARRAY_CLASS(_type)                                     \
-DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
-
-#define DEFINE_DARRAY(_type)                                           \
-typedef DARRAY(_type)  darray_##_type;                                 \
-DEFINE_DARRAY_CLASS(darray_##_type)
-
-#define DEFINE_DARRAY_NAMED(_name, _type)                              \
-typedef DARRAY(_type)  _name;                                          \
-DEFINE_DARRAY_CLASS(_name)
-
-DEFINE_DARRAY_CLASS(darray_char)
-DEFINE_DARRAY_CLASS(darray_str)
-DEFINE_DARRAY_CLASS(darray_const_str)
-
-DEFINE_DARRAY_CLASS(darray_u8)
-DEFINE_DARRAY_CLASS(darray_u16)
-DEFINE_DARRAY_CLASS(darray_u32)
-DEFINE_DARRAY_CLASS(darray_u64)
-
-DEFINE_DARRAY_CLASS(darray_s8)
-DEFINE_DARRAY_CLASS(darray_s16)
-DEFINE_DARRAY_CLASS(darray_s32)
-DEFINE_DARRAY_CLASS(darray_s64)
-
-#endif /* _BCACHEFS_DARRAY_H */
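
Usage-wise this reads like a small kernel-side vector type. A hypothetical in-kernel caller might look like the sketch below: darray_push() returns -ENOMEM on allocation failure, and darray_exit() frees any heap storage while leaving the preallocated buffer alone:

    /* Sketch of a typical caller, using the macros defined above: */
    static int collect_evens(unsigned max, darray_u32 *out)
    {
        darray_init(out);

        for (u32 i = 0; i < max; i += 2) {
            int ret = darray_push(out, i);
            if (ret) {
                darray_exit(out);   /* free heap storage, re-init */
                return ret;         /* -ENOMEM */
            }
        }

        darray_for_each(*out, i)
            pr_info("got %u\n", *i);

        return 0;                   /* caller eventually does darray_exit(out) */
    }
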
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
deleted file mode 100644 (file)
index e848e21..0000000
+++ /dev/null
@@ -1,1021 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-
-static const char * const bch2_data_update_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
-       BCH_DATA_UPDATE_TYPES()
-#undef x
-       NULL
-};
-
-static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(ptrs, ptr)
-               bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
-}
-
-static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
-                       bkey_for_each_ptr(ptrs, ptr2) {
-                               if (ptr2 == ptr)
-                                       break;
-                               bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
-                       }
-                       return false;
-               }
-       }
-       return true;
-}
-
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-               struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-               bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-       }
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
-                      const struct bch_extent_ptr *start)
-{
-       if (!ctxt) {
-               bkey_for_each_ptr(ptrs, ptr) {
-                       if (ptr == start)
-                               break;
-
-                       struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-                       struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-                       bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-               }
-               return false;
-       }
-
-       __bkey_for_each_ptr(start, ptrs.end, ptr) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-               struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-               bool locked;
-               move_ctxt_wait_event(ctxt,
-                                    (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
-                                    list_empty(&ctxt->ios));
-               if (!locked)
-                       bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
-       }
-       return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
-       bkey_for_each_ptr(ptrs, ptr) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-               struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-               if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
-                       return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
-       }
-
-       return true;
-}
-
-noinline_for_stack
-static void trace_io_move_finish2(struct data_update *u,
-                                 struct bkey_i *new,
-                                 struct bkey_i *insert)
-{
-       struct bch_fs *c = u->op.c;
-       struct printbuf buf = PRINTBUF;
-
-       prt_newline(&buf);
-
-       bch2_data_update_to_text(&buf, u);
-       prt_newline(&buf);
-
-       prt_str_indented(&buf, "new replicas:\t");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
-       prt_newline(&buf);
-
-       prt_str_indented(&buf, "insert:\t");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-       prt_newline(&buf);
-
-       trace_io_move_finish(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_fail2(struct data_update *m,
-                        struct bkey_s_c new,
-                        struct bkey_s_c wrote,
-                        struct bkey_i *insert,
-                        const char *msg)
-{
-       struct bch_fs *c = m->op.c;
-       struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-       struct printbuf buf = PRINTBUF;
-       unsigned rewrites_found = 0;
-
-       if (!trace_io_move_fail_enabled())
-               return;
-
-       prt_str(&buf, msg);
-
-       if (insert) {
-               const union bch_extent_entry *entry;
-               struct bch_extent_ptr *ptr;
-               struct extent_ptr_decoded p;
-
-               unsigned ptr_bit = 1;
-               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-                       if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
-                           (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
-                           !ptr->cached)
-                               rewrites_found |= ptr_bit;
-                       ptr_bit <<= 1;
-               }
-       }
-
-       prt_str(&buf, "rewrites found:\t");
-       bch2_prt_u64_base2(&buf, rewrites_found);
-       prt_newline(&buf);
-
-       bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
-       prt_str(&buf, "\nold:    ");
-       bch2_bkey_val_to_text(&buf, c, old);
-
-       prt_str(&buf, "\nnew:    ");
-       bch2_bkey_val_to_text(&buf, c, new);
-
-       prt_str(&buf, "\nwrote:  ");
-       bch2_bkey_val_to_text(&buf, c, wrote);
-
-       if (insert) {
-               prt_str(&buf, "\ninsert: ");
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-       }
-
-       trace_io_move_fail(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_data_update2(struct data_update *m,
-                              struct bkey_s_c old, struct bkey_s_c k,
-                              struct bkey_i *insert)
-{
-       struct bch_fs *c = m->op.c;
-       struct printbuf buf = PRINTBUF;
-
-       prt_str(&buf, "\nold: ");
-       bch2_bkey_val_to_text(&buf, c, old);
-       prt_str(&buf, "\nk:   ");
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_str(&buf, "\nnew: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
-       trace_data_update(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
-                                            struct bkey_s_c old, struct bkey_s_c k,
-                                            struct bkey_i *insert)
-{
-       struct bch_fs *c = m->op.c;
-       struct printbuf buf = PRINTBUF;
-
-       bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
-       prt_str(&buf, "\nold: ");
-       bch2_bkey_val_to_text(&buf, c, old);
-       prt_str(&buf, "\nk:   ");
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_str(&buf, "\nnew: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
-       trace_io_move_created_rebalance(c, buf.buf);
-       printbuf_exit(&buf);
-
-       this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
-}
-
-noinline_for_stack
-static int data_update_invalid_bkey(struct data_update *m,
-                                   struct bkey_s_c old, struct bkey_s_c k,
-                                   struct bkey_i *insert)
-{
-       struct bch_fs *c = m->op.c;
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       prt_str(&buf, "about to insert invalid key in data update path");
-       prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
-       prt_str(&buf, "\nold: ");
-       bch2_bkey_val_to_text(&buf, c, old);
-       prt_str(&buf, "\nk:   ");
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_str(&buf, "\nnew: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-       prt_newline(&buf);
-
-       bch2_fs_emergency_read_only2(c, &buf);
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-
-       return bch_err_throw(c, invalid_bkey);
-}
-
-static int __bch2_data_update_index_update(struct btree_trans *trans,
-                                          struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_iter iter;
-       struct data_update *m = container_of(op, struct data_update, op);
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, m->btree_id,
-                            bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
-                            BTREE_ITER_slots|BTREE_ITER_intent);
-
-       while (1) {
-               struct bkey_s_c k;
-               struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-               struct bkey_i *insert = NULL;
-               struct bkey_i_extent *new;
-               const union bch_extent_entry *entry_c;
-               union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-               struct bch_extent_ptr *ptr;
-               const struct bch_extent_ptr *ptr_c;
-               struct bpos next_pos;
-               bool should_check_enospc;
-               s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-               unsigned rewrites_found = 0, durability, ptr_bit;
-
-               bch2_trans_begin(trans);
-
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
-
-               if (!bch2_extents_match(k, old)) {
-                       trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
-                                           NULL, "no match:");
-                       goto nowork;
-               }
-
-               insert = bch2_trans_kmalloc(trans,
-                                           bkey_bytes(k.k) +
-                                           bkey_val_bytes(&new->k) +
-                                           sizeof(struct bch_extent_rebalance));
-               ret = PTR_ERR_OR_ZERO(insert);
-               if (ret)
-                       goto err;
-
-               bkey_reassemble(insert, k);
-
-               new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto err;
-
-               bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
-               bch2_cut_front(iter.pos, &new->k_i);
-
-               bch2_cut_front(iter.pos,        insert);
-               bch2_cut_back(new->k.p,         insert);
-               bch2_cut_back(insert->k.p,      &new->k_i);
-
-               /*
-                * @old: extent that we read from
-                * @insert: key that we're going to update, initialized from
-                * extent currently in btree - same as @old unless we raced with
-                * other updates
-                * @new: extent with new pointers that we'll be adding to @insert
-                *
-                * First, drop rewrite_ptrs from @new:
-                */
-               ptr_bit = 1;
-               bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
-                       if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
-                           (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
-                           !ptr->cached) {
-                               bch2_extent_ptr_set_cached(c, &m->op.opts,
-                                                          bkey_i_to_s(insert), ptr);
-                               rewrites_found |= ptr_bit;
-                       }
-                       ptr_bit <<= 1;
-               }
-
-               if (m->data_opts.rewrite_ptrs &&
-                   !rewrites_found &&
-                   bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
-                       trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
-                       goto nowork;
-               }
-
-               /*
-                * A replica that we just wrote might conflict with a replica
-                * that we want to keep, due to racing with another move:
-                */
-restart_drop_conflicting_replicas:
-               extent_for_each_ptr(extent_i_to_s(new), ptr)
-                       if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
-                           !ptr_c->cached) {
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
-                               goto restart_drop_conflicting_replicas;
-                       }
-
-               if (!bkey_val_u64s(&new->k)) {
-                       trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
-                       goto nowork;
-               }
-
-               /* Now, drop pointers that conflict with what we just wrote: */
-               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
-                       if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-
-               durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
-                       bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
-
-               /* Now, drop excess replicas: */
-               scoped_guard(rcu) {
-restart_drop_extra_replicas:
-                       bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
-                               unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
-
-                               if (!p.ptr.cached &&
-                                   durability - ptr_durability >= m->op.opts.data_replicas) {
-                                       durability -= ptr_durability;
-
-                                       bch2_extent_ptr_set_cached(c, &m->op.opts,
-                                                                  bkey_i_to_s(insert), &entry->ptr);
-                                       goto restart_drop_extra_replicas;
-                               }
-                       }
-               }
-
-               /* Finally, add the pointers we just wrote: */
-               extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
-                       bch2_extent_ptr_decoded_append(insert, &p);
-
-               bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
-               bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
-
-               ret = bch2_sum_sector_overwrites(trans, &iter, insert,
-                                                &should_check_enospc,
-                                                &i_sectors_delta,
-                                                &disk_sectors_delta);
-               if (ret)
-                       goto err;
-
-               if (disk_sectors_delta > (s64) op->res.sectors) {
-                       ret = bch2_disk_reservation_add(c, &op->res,
-                                               disk_sectors_delta - op->res.sectors,
-                                               !should_check_enospc
-                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
-                       if (ret)
-                               goto out;
-               }
-
-               next_pos = insert->k.p;
-
-               /*
-                * Check for nonce offset inconsistency:
-                * This is debug code - we've been seeing this bug rarely, and
-                * it's been hard to reproduce, so this should give us some more
-                * information when it does occur:
-                */
-               int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
-                                                (struct bkey_validate_context) {
-                                                       .btree  = m->btree_id,
-                                                       .flags  = BCH_VALIDATE_commit,
-                                                });
-               if (unlikely(invalid)) {
-                       ret = data_update_invalid_bkey(m, old, k, insert);
-                       goto out;
-               }
-
-               ret =   bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
-                       bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
-                       bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-                                               k.k->p, bkey_start_pos(&insert->k)) ?:
-                       bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-                                               k.k->p, insert->k.p) ?:
-                       bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
-                       bch2_trans_update(trans, &iter, insert,
-                               BTREE_UPDATE_internal_snapshot_node);
-               if (ret)
-                       goto err;
-
-               if (trace_data_update_enabled())
-                       trace_data_update2(m, old, k, insert);
-
-               if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
-                   bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
-                       trace_io_move_created_rebalance2(m, old, k, insert);
-
-               ret =   bch2_trans_commit(trans, &op->res,
-                               NULL,
-                               BCH_TRANS_COMMIT_no_check_rw|
-                               BCH_TRANS_COMMIT_no_enospc|
-                               m->data_opts.btree_insert_flags);
-               if (ret)
-                       goto err;
-
-               bch2_btree_iter_set_pos(trans, &iter, next_pos);
-
-               this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
-               if (trace_io_move_finish_enabled())
-                       trace_io_move_finish2(m, &new->k_i, insert);
-err:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-               if (ret)
-                       break;
-next:
-               while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
-                       bch2_keylist_pop_front(&op->insert_keys);
-                       if (bch2_keylist_empty(&op->insert_keys))
-                               goto out;
-               }
-               continue;
-nowork:
-               if (m->stats) {
-                       BUG_ON(k.k->p.offset <= iter.pos.offset);
-                       atomic64_inc(&m->stats->keys_raced);
-                       atomic64_add(k.k->p.offset - iter.pos.offset,
-                                    &m->stats->sectors_raced);
-               }
-
-               count_event(c, io_move_fail);
-
-               bch2_btree_iter_advance(trans, &iter);
-               goto next;
-       }
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-       return ret;
-}
-
-int bch2_data_update_index_update(struct bch_write_op *op)
-{
-       return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
-}
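
A convention used throughout this path: data_update_opts.rewrite_ptrs and .kill_ptrs are bitmasks addressed by a pointer's position within the extent, with ptr_bit starting at 1 and shifting left once per pointer, so bit N refers to the Nth pointer. A standalone sketch of that walk, with a toy pointer array standing in for the bkey:

    #include <stdio.h>

    struct toy_ptr { unsigned dev; int cached; };

    /* Mark which pointers an update would rewrite, mirroring the ptr_bit walk. */
    static unsigned rewrites_found(const struct toy_ptr *ptrs, int nr,
                                   unsigned rewrite_ptrs)
    {
        unsigned found = 0, ptr_bit = 1;

        for (int i = 0; i < nr; i++) {
            if ((ptr_bit & rewrite_ptrs) && !ptrs[i].cached)
                found |= ptr_bit;
            ptr_bit <<= 1;
        }
        return found;
    }

    int main(void)
    {
        struct toy_ptr ptrs[] = { { 0, 0 }, { 1, 1 }, { 2, 0 } };

        /* ask to rewrite pointers 0 and 1; pointer 1 is cached, so only bit 0 sticks */
        printf("%#x\n", rewrites_found(ptrs, 3, 0x3));   /* 0x1 */
        return 0;
    }
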
-
-void bch2_data_update_read_done(struct data_update *m)
-{
-       m->read_done = true;
-
-       /* write bio must own pages: */
-       BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
-       m->op.crc = m->rbio.pick.crc;
-       m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
-       this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
-
-       closure_call(&m->op.cl, bch2_write, NULL, NULL);
-}
-
-void bch2_data_update_exit(struct data_update *update)
-{
-       struct bch_fs *c = update->op.c;
-       struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
-
-       bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
-       kfree(update->bvecs);
-       update->bvecs = NULL;
-
-       if (c->opts.nocow_enabled)
-               bkey_nocow_unlock(c, k);
-       bkey_put_dev_refs(c, k);
-       bch2_disk_reservation_put(c, &update->op.res);
-       bch2_bkey_buf_exit(&update->k, c);
-}
-
-static noinline_for_stack
-int bch2_update_unwritten_extent(struct btree_trans *trans,
-                                struct data_update *update)
-{
-       struct bch_fs *c = update->op.c;
-       struct bkey_i_extent *e;
-       struct write_point *wp;
-       struct closure cl;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       closure_init_stack(&cl);
-       bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
-
-       while (bpos_lt(update->op.pos, update->k.k->k.p)) {
-               unsigned sectors = update->k.k->k.p.offset -
-                       update->op.pos.offset;
-
-               bch2_trans_begin(trans);
-
-               bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
-                                    BTREE_ITER_slots);
-               ret = lockrestart_do(trans, ({
-                       k = bch2_btree_iter_peek_slot(trans, &iter);
-                       bkey_err(k);
-               }));
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
-                       break;
-
-               e = bkey_extent_init(update->op.insert_keys.top);
-               e->k.p = update->op.pos;
-
-               ret = bch2_alloc_sectors_start_trans(trans,
-                               update->op.target,
-                               false,
-                               update->op.write_point,
-                               &update->op.devs_have,
-                               update->op.nr_replicas,
-                               update->op.nr_replicas,
-                               update->op.watermark,
-                               0, &cl, &wp);
-               if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
-                       bch2_trans_unlock(trans);
-                       closure_sync(&cl);
-                       continue;
-               }
-
-               bch_err_fn_ratelimited(c, ret);
-
-               if (ret)
-                       break;
-
-               sectors = min(sectors, wp->sectors_free);
-
-               bch2_key_resize(&e->k, sectors);
-
-               bch2_open_bucket_get(c, wp, &update->op.open_buckets);
-               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-               bch2_alloc_sectors_done(c, wp);
-
-               update->op.pos.offset += sectors;
-
-               extent_for_each_ptr(extent_i_to_s(e), ptr)
-                       ptr->unwritten = true;
-               bch2_keylist_push(&update->op.insert_keys);
-
-               ret = __bch2_data_update_index_update(trans, &update->op);
-
-               bch2_open_buckets_put(c, &update->op.open_buckets);
-
-               if (ret)
-                       break;
-       }
-
-       if (closure_nr_remaining(&cl) != 1) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-       }
-
-       return ret;
-}
-
-void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
-                                  struct bch_io_opts *io_opts,
-                                  struct data_update_opts *data_opts)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 20);
-
-       prt_str_indented(out, "rewrite ptrs:\t");
-       bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
-       prt_newline(out);
-
-       prt_str_indented(out, "kill ptrs:\t");
-       bch2_prt_u64_base2(out, data_opts->kill_ptrs);
-       prt_newline(out);
-
-       prt_str_indented(out, "target:\t");
-       bch2_target_to_text(out, c, data_opts->target);
-       prt_newline(out);
-
-       prt_str_indented(out, "compression:\t");
-       bch2_compression_opt_to_text(out, io_opts->background_compression);
-       prt_newline(out);
-
-       prt_str_indented(out, "opts.replicas:\t");
-       prt_u64(out, io_opts->data_replicas);
-       prt_newline(out);
-
-       prt_str_indented(out, "extra replicas:\t");
-       prt_u64(out, data_opts->extra_replicas);
-       prt_newline(out);
-
-       prt_str_indented(out, "scrub:\t");
-       prt_u64(out, data_opts->scrub);
-}
-
-void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
-{
-       prt_str(out, bch2_data_update_type_strs[m->type]);
-       prt_newline(out);
-
-       bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
-       prt_newline(out);
-
-       prt_str_indented(out, "old key:\t");
-       bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
-}
-
-void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
-{
-       bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-       bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
-
-       if (!m->read_done) {
-               prt_printf(out, "read:\n");
-               printbuf_indent_add(out, 2);
-               bch2_read_bio_to_text(out, &m->rbio);
-       } else {
-               prt_printf(out, "write:\n");
-               printbuf_indent_add(out, 2);
-               bch2_write_op_to_text(out, &m->op);
-       }
-       printbuf_indent_sub(out, 4);
-}
-
-int bch2_extent_drop_ptrs(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct bkey_s_c k,
-                         struct bch_io_opts *io_opts,
-                         struct data_update_opts *data_opts)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *n;
-       int ret;
-
-       n = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       while (data_opts->kill_ptrs) {
-               unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
-
-               bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
-               data_opts->kill_ptrs ^= 1U << drop;
-       }
-
-       /*
-        * If the new extent no longer has any pointers, bch2_extent_normalize_by_opts()
-        * will do the appropriate thing with it (turning it into a
-        * KEY_TYPE_error key, or just a discard if it was a cached extent)
-        */
-       bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
-
-       /*
-        * Since we're not inserting through an extent iterator
-        * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
-        * we aren't using the extent overwrite path to delete, we're
-        * just using the normal key deletion path:
-        */
-       if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
-               n->k.size = 0;
-
-       return bch2_trans_relock(trans) ?:
-               bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
-               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
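
The kill loop in bch2_extent_drop_ptrs() clears one pointer per iteration, highest set bit first: __fls() yields the index of the most significant set bit, that pointer is dropped by position, and the bit is xor-ed away. Dropping from the top keeps the lower indices stable as pointers are removed. The same bit dance in userspace, assuming GCC/Clang for __builtin_clz:

    #include <stdio.h>

    /* Index of the most significant set bit, like the kernel's __fls(). */
    static unsigned fls_idx(unsigned v)
    {
        return 31 - __builtin_clz(v);
    }

    int main(void)
    {
        unsigned kill_ptrs = 0x16;              /* pointers 1, 2 and 4 */

        while (kill_ptrs) {
            unsigned drop = fls_idx(kill_ptrs);
            printf("dropping ptr %u\n", drop);  /* 4, then 2, then 1 */
            kill_ptrs ^= 1U << drop;
        }
        return 0;
    }
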
-
-static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
-                                       struct bch_io_opts *io_opts,
-                                       unsigned buf_bytes)
-{
-       unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
-       m->bvecs = kmalloc_array(nr_vecs, sizeof(*m->bvecs), GFP_KERNEL);
-       if (!m->bvecs)
-               return -ENOMEM;
-
-       bio_init(&m->rbio.bio,          NULL, m->bvecs, nr_vecs, REQ_OP_READ);
-       bio_init(&m->op.wbio.bio,       NULL, m->bvecs, nr_vecs, 0);
-
-       if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
-               kfree(m->bvecs);
-               m->bvecs = NULL;
-               return -ENOMEM;
-       }
-
-       rbio_init(&m->rbio.bio, c, *io_opts, NULL);
-       m->rbio.data_update             = true;
-       m->rbio.bio.bi_iter.bi_size     = buf_bytes;
-       m->rbio.bio.bi_iter.bi_sector   = bkey_start_offset(&m->k.k->k);
-       m->op.wbio.bio.bi_ioprio        = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-       return 0;
-}
-
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
-                              struct bch_io_opts *io_opts)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-
-       /* write path might have to decompress data: */
-       unsigned buf_bytes = 0;
-       bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
-               buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
-       return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
-}
-
-static int can_write_extent(struct bch_fs *c, struct data_update *m)
-{
-       if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
-           unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
-               return bch_err_throw(c, data_update_done_would_block);
-
-       unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
-               ? m->op.target
-               : 0;
-       struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
-       darray_for_each(m->op.devs_have, i)
-               __clear_bit(*i, devs.d);
-
-       guard(rcu)();
-
-       unsigned nr_replicas = 0, i;
-       for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
-               struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
-               if (!ca)
-                       continue;
-
-               struct bch_dev_usage usage;
-               bch2_dev_usage_read_fast(ca, &usage);
-
-               if (!dev_buckets_free(ca, usage, m->op.watermark))
-                       continue;
-
-               nr_replicas += ca->mi.durability;
-               if (nr_replicas >= m->op.nr_replicas)
-                       break;
-       }
-
-       if (!nr_replicas)
-               return bch_err_throw(c, data_update_done_no_rw_devs);
-       if (nr_replicas < m->op.nr_replicas)
-               return bch_err_throw(c, insufficient_devices);
-       return 0;
-}
-
-int bch2_data_update_init(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct moving_context *ctxt,
-                         struct data_update *m,
-                         struct write_point_specifier wp,
-                         struct bch_io_opts *io_opts,
-                         struct data_update_opts data_opts,
-                         enum btree_id btree_id,
-                         struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       if (k.k->p.snapshot) {
-               ret = bch2_check_key_has_snapshot(trans, iter, k);
-               if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
-                       /* Can't repair yet, waiting on other recovery passes */
-                       return bch_err_throw(c, data_update_done_no_snapshot);
-               }
-               if (ret < 0)
-                       return ret;
-               if (ret) /* key was deleted */
-                       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-                               bch_err_throw(c, data_update_done_no_snapshot);
-               ret = 0;
-       }
-
-       bch2_bkey_buf_init(&m->k);
-       bch2_bkey_buf_reassemble(&m->k, c, k);
-       m->type         = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
-               ? BCH_DATA_UPDATE_copygc
-               : BCH_DATA_UPDATE_rebalance;
-       m->btree_id     = btree_id;
-       m->data_opts    = data_opts;
-       m->ctxt         = ctxt;
-       m->stats        = ctxt ? ctxt->stats : NULL;
-
-       bch2_write_op_init(&m->op, c, *io_opts);
-       m->op.pos       = bkey_start_pos(k.k);
-       m->op.version   = k.k->bversion;
-       m->op.target    = data_opts.target;
-       m->op.write_point = wp;
-       m->op.nr_replicas = 0;
-       m->op.flags     |= BCH_WRITE_pages_stable|
-               BCH_WRITE_pages_owned|
-               BCH_WRITE_data_encoded|
-               BCH_WRITE_move|
-               m->data_opts.write_flags;
-       m->op.compression_opt   = io_opts->background_compression;
-       m->op.watermark         = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
-
-       unsigned durability_have = 0, durability_removing = 0;
-
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
-       unsigned buf_bytes = 0;
-       bool unwritten = false;
-
-       unsigned ptr_bit = 1;
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (!p.ptr.cached) {
-                       guard(rcu)();
-                       if (ptr_bit & m->data_opts.rewrite_ptrs) {
-                               if (crc_is_compressed(p.crc))
-                                       reserve_sectors += k.k->size;
-
-                               m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-                               durability_removing += bch2_extent_ptr_desired_durability(c, &p);
-                       } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
-                               bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
-                               durability_have += bch2_extent_ptr_durability(c, &p);
-                       }
-               }
-
-               /*
-                * op->csum_type is normally initialized from the fs/file's
-                * current options - but if an extent is encrypted, we require
-                * that it stays encrypted:
-                */
-               if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
-                       m->op.nonce     = p.crc.nonce + p.crc.offset;
-                       m->op.csum_type = p.crc.csum_type;
-               }
-
-               if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
-                       m->op.incompressible = true;
-
-               buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-               unwritten |= p.ptr.unwritten;
-
-               ptr_bit <<= 1;
-       }
-
-       unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
-
-       /*
-        * If the extent's current durability is less than io_opts.data_replicas,
-        * we're not trying to re-replicate the extent up to data_replicas here -
-        * unless extra_replicas was specified.
-        *
-        * Increasing replication is, for now, an explicit operation triggered by
-        * rereplicate, so that users don't get an unexpected -ENOSPC.
-        */
-       m->op.nr_replicas = min(durability_removing, durability_required) +
-               m->data_opts.extra_replicas;
-
-       /*
-        * If device(s) were set to durability=0 after data was written to them
-        * we can end up with a durability=0 extent, and the normal algorithm
-        * that tries not to increase durability doesn't work:
-        */
-       if (!(durability_have + durability_removing))
-               m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
-
-       m->op.nr_replicas_required = m->op.nr_replicas;
-
-       /*
-        * It might turn out that we don't need any new replicas, if the
-        * replicas or durability settings have been changed since the extent
-        * was written:
-        */
-       if (!m->op.nr_replicas) {
-               m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
-               m->data_opts.rewrite_ptrs = 0;
-               /* if iter == NULL, it's just a promote */
-               if (iter)
-                       ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
-               if (!ret)
-                       ret = bch_err_throw(c, data_update_done_no_writes_needed);
-               goto out_bkey_buf_exit;
-       }
-
-       /*
-        * Check if the allocation will succeed, to avoid getting an error later
-        * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
-        * read:
-        *
-        * This guards against
-        * - BCH_WRITE_alloc_nowait allocations failing (promotes)
-        * - Destination target full
-        * - Device(s) in destination target offline
-        * - Insufficient durability available in destination target
-        *   (i.e. trying to move a durability=2 replica to a target with a
-        *   single durability=2 device)
-        */
-       ret = can_write_extent(c, m);
-       if (ret)
-               goto out_bkey_buf_exit;
-
-       if (reserve_sectors) {
-               ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
-                               m->data_opts.extra_replicas
-                               ? 0
-                               : BCH_DISK_RESERVATION_NOFAIL);
-               if (ret)
-                       goto out_bkey_buf_exit;
-       }
-
-       if (!bkey_get_dev_refs(c, k)) {
-               ret = bch_err_throw(c, data_update_done_no_dev_refs);
-               goto out_put_disk_res;
-       }
-
-       if (c->opts.nocow_enabled &&
-           !bkey_nocow_lock(c, ctxt, ptrs)) {
-               ret = bch_err_throw(c, nocow_lock_blocked);
-               goto out_put_dev_refs;
-       }
-
-       if (unwritten) {
-               ret = bch2_update_unwritten_extent(trans, m) ?:
-                       bch_err_throw(c, data_update_done_unwritten);
-               goto out_nocow_unlock;
-       }
-
-       bch2_trans_unlock(trans);
-
-       ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
-       if (ret)
-               goto out_nocow_unlock;
-
-       return 0;
-out_nocow_unlock:
-       if (c->opts.nocow_enabled)
-               bkey_nocow_unlock(c, k);
-out_put_dev_refs:
-       bkey_put_dev_refs(c, k);
-out_put_disk_res:
-       bch2_disk_reservation_put(c, &m->op.res);
-out_bkey_buf_exit:
-       bch2_bkey_buf_exit(&m->k, c);
-       return ret;
-}
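
The replica count computed above comes down to three quantities: durability_have (pointers kept as-is), durability_removing (pointers being rewritten), and the data_replicas option. Taking min(durability_removing, durability_required) means a plain move writes back exactly the durability it removes, never more. A worked example with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical: keep one durability-1 replica, rewrite one, want 3 total */
        unsigned have = 1, removing = 1, data_replicas = 3, extra = 0;

        unsigned required = data_replicas > have ? data_replicas - have : 0;
        unsigned nr = (removing < required ? removing : required) + extra;

        /* required=2 nr_replicas=1: the move replaces 1 replica, no re-replication */
        printf("required=%u nr_replicas=%u\n", required, nr);
        return 0;
    }
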
-
-void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       unsigned ptr_bit = 1;
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
-                       opts->kill_ptrs |= ptr_bit;
-                       opts->rewrite_ptrs ^= ptr_bit;
-               }
-
-               ptr_bit <<= 1;
-       }
-}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
deleted file mode 100644 (file)
index 5e14d13..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _BCACHEFS_DATA_UPDATE_H
-#define _BCACHEFS_DATA_UPDATE_H
-
-#include "bkey_buf.h"
-#include "io_read.h"
-#include "io_write_types.h"
-
-struct moving_context;
-
-struct data_update_opts {
-       unsigned        rewrite_ptrs;
-       unsigned        kill_ptrs;
-       u16             target;
-       u8              extra_replicas;
-       unsigned        btree_insert_flags;
-       unsigned        write_flags;
-
-       int             read_dev;
-       bool            scrub;
-};
-
-void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
-                                  struct bch_io_opts *, struct data_update_opts *);
-
-#define BCH_DATA_UPDATE_TYPES()                \
-       x(copygc,       0)              \
-       x(rebalance,    1)              \
-       x(promote,      2)
-
-enum bch_data_update_types {
-#define x(n, id)       BCH_DATA_UPDATE_##n = id,
-       BCH_DATA_UPDATE_TYPES()
-#undef x
-};
-
-struct data_update {
-       enum bch_data_update_types type;
-       /* extent being updated: */
-       bool                    read_done;
-       enum btree_id           btree_id;
-       struct bkey_buf         k;
-       struct data_update_opts data_opts;
-       struct moving_context   *ctxt;
-       struct bch_move_stats   *stats;
-
-       struct bch_read_bio     rbio;
-       struct bch_write_op     op;
-       struct bio_vec          *bvecs;
-};
-
-struct promote_op {
-       struct rcu_head         rcu;
-       u64                     start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       unsigned                list_idx;
-#endif
-
-       struct rhash_head       hash;
-       struct bpos             pos;
-
-       struct work_struct      work;
-       struct data_update      write;
-       struct bio_vec          bi_inline_vecs[]; /* must be last */
-};
-
-void bch2_data_update_to_text(struct printbuf *, struct data_update *);
-void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
-
-int bch2_data_update_index_update(struct bch_write_op *);
-
-void bch2_data_update_read_done(struct data_update *);
-
-int bch2_extent_drop_ptrs(struct btree_trans *,
-                         struct btree_iter *,
-                         struct bkey_s_c,
-                         struct bch_io_opts *,
-                         struct data_update_opts *);
-
-int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
-                              struct bch_io_opts *);
-
-void bch2_data_update_exit(struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
-                         struct moving_context *,
-                         struct data_update *,
-                         struct write_point_specifier,
-                         struct bch_io_opts *, struct data_update_opts,
-                         enum btree_id, struct bkey_s_c);
-void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
-
-#endif /* _BCACHEFS_DATA_UPDATE_H */
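
BCH_DATA_UPDATE_TYPES() is the usual x-macro idiom: the same list expands once into the enum here and once into bch2_data_update_type_strs[] over in data_update.c, so the names and the string table cannot drift apart. The pattern in minimal standalone form:

    #include <stdio.h>

    #define TYPES()        \
        x(copygc,    0)    \
        x(rebalance, 1)    \
        x(promote,   2)

    enum types {
    #define x(n, id) TYPE_##n = id,
        TYPES()
    #undef x
    };

    static const char * const type_strs[] = {
    #define x(t, n, ...) [n] = #t,
        TYPES()
    #undef x
        NULL
    };

    int main(void)
    {
        printf("%s\n", type_strs[TYPE_rebalance]);  /* rebalance */
        return 0;
    }
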
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
deleted file mode 100644 (file)
index 07c2a0f..0000000
+++ /dev/null
@@ -1,996 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Assorted bcachefs debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal_reclaim.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
-                                     struct extent_ptr_decoded pick)
-{
-       struct btree *v = c->verify_data;
-       struct btree_node *n_ondisk = c->verify_ondisk;
-       struct btree_node *n_sorted = c->verify_data->data;
-       struct bset *sorted, *inmemory = &b->data->keys;
-       struct bio *bio;
-       bool failed = false;
-
-       struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-                               BCH_DEV_READ_REF_btree_verify_replicas);
-       if (!ca)
-               return false;
-
-       bio = bio_alloc_bioset(ca->disk_sb.bdev,
-                              buf_pages(n_sorted, btree_buf_bytes(b)),
-                              REQ_OP_READ|REQ_META,
-                              GFP_NOFS,
-                              &c->btree_bio);
-       bio->bi_iter.bi_sector  = pick.ptr.offset;
-       bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
-
-       submit_bio_wait(bio);
-
-       bio_put(bio);
-       enumerated_ref_put(&ca->io_ref[READ],
-                          BCH_DEV_READ_REF_btree_verify_replicas);
-
-       memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
-
-       v->written = 0;
-       if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
-               return false;
-
-       n_sorted = c->verify_data->data;
-       sorted = &n_sorted->keys;
-
-       if (inmemory->u64s != sorted->u64s ||
-           memcmp(inmemory->start,
-                  sorted->start,
-                  vstruct_end(inmemory) - (void *) inmemory->start)) {
-               unsigned offset = 0, sectors;
-               struct bset *i;
-               unsigned j;
-
-               console_lock();
-
-               printk(KERN_ERR "*** in memory:\n");
-               bch2_dump_bset(c, b, inmemory, 0);
-
-               printk(KERN_ERR "*** read back in:\n");
-               bch2_dump_bset(c, v, sorted, 0);
-
-               while (offset < v->written) {
-                       if (!offset) {
-                               i = &n_ondisk->keys;
-                               sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
-                                       c->block_bits;
-                       } else {
-                               struct btree_node_entry *bne =
-                                       (void *) n_ondisk + (offset << 9);
-                               i = &bne->keys;
-
-                               sectors = vstruct_blocks(bne, c->block_bits) <<
-                                       c->block_bits;
-                       }
-
-                       printk(KERN_ERR "*** on disk block %u:\n", offset);
-                       bch2_dump_bset(c, b, i, offset);
-
-                       offset += sectors;
-               }
-
-               for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
-                       if (inmemory->_data[j] != sorted->_data[j])
-                               break;
-
-               console_unlock();
-               bch_err(c, "verify failed at key %u", j);
-
-               failed = true;
-       }
-
-       if (v->written != b->written) {
-               bch_err(c, "written wrong: expected %u, got %u",
-                       b->written, v->written);
-               failed = true;
-       }
-
-       return failed;
-}
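A word-by-word comparison against the re-sorted on-disk copy is the heart of the
check above; the loop near the end reports the index of the first u64 that
diverges. A minimal userspace sketch of that comparison step (names are
illustrative, not bcachefs API):

    #include <stdint.h>
    #include <stddef.h>

    /* Index of the first differing u64, or n if the copies match. */
    static size_t first_mismatch(const uint64_t *a, const uint64_t *b, size_t n)
    {
            size_t i;

            for (i = 0; i < n; i++)
                    if (a[i] != b[i])
                            break;
            return i;
    }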
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
-       struct bkey_ptrs_c ptrs;
-       struct extent_ptr_decoded p;
-       const union bch_extent_entry *entry;
-       struct btree *v;
-       struct bset *inmemory = &b->data->keys;
-       struct bkey_packed *k;
-       bool failed = false;
-
-       if (c->opts.nochanges)
-               return;
-
-       bch2_btree_node_io_lock(b);
-       mutex_lock(&c->verify_lock);
-
-       if (!c->verify_ondisk) {
-               c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
-               if (!c->verify_ondisk)
-                       goto out;
-       }
-
-       if (!c->verify_data) {
-               c->verify_data = __bch2_btree_node_mem_alloc(c);
-               if (!c->verify_data)
-                       goto out;
-       }
-
-       BUG_ON(b->nsets != 1);
-
-       for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
-               if (k->type == KEY_TYPE_btree_ptr_v2)
-                       ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
-
-       v = c->verify_data;
-       bkey_copy(&v->key, &b->key);
-       v->c.level      = b->c.level;
-       v->c.btree_id   = b->c.btree_id;
-       bch2_btree_keys_init(v);
-
-       ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
-       bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
-               failed |= bch2_btree_verify_replica(c, b, p);
-
-       if (failed) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-               bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
-               printbuf_exit(&buf);
-       }
-out:
-       mutex_unlock(&c->verify_lock);
-       bch2_btree_node_io_unlock(b);
-}
-
-void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
-                                   const struct btree *b)
-{
-       struct btree_node *n_ondisk = NULL;
-       struct extent_ptr_decoded pick;
-       struct bch_dev *ca;
-       struct bio *bio = NULL;
-       unsigned offset = 0;
-       int ret;
-
-       if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
-               prt_printf(out, "error getting device to read from: invalid device\n");
-               return;
-       }
-
-       ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-                       BCH_DEV_READ_REF_btree_node_ondisk_to_text);
-       if (!ca) {
-               prt_printf(out, "error getting device to read from: not online\n");
-               return;
-       }
-
-       n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
-       if (!n_ondisk) {
-               prt_printf(out, "memory allocation failure\n");
-               goto out;
-       }
-
-       bio = bio_alloc_bioset(ca->disk_sb.bdev,
-                              buf_pages(n_ondisk, btree_buf_bytes(b)),
-                              REQ_OP_READ|REQ_META,
-                              GFP_NOFS,
-                              &c->btree_bio);
-       bio->bi_iter.bi_sector  = pick.ptr.offset;
-       bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
-
-       ret = submit_bio_wait(bio);
-       if (ret) {
-               prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
-               goto out;
-       }
-
-       while (offset < btree_sectors(c)) {
-               struct bset *i;
-               struct nonce nonce;
-               struct bch_csum csum;
-               struct bkey_packed *k;
-               unsigned sectors;
-
-               if (!offset) {
-                       i = &n_ondisk->keys;
-
-                       if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
-                               prt_printf(out, "unknown checksum type at offset %u: %llu\n",
-                                          offset, BSET_CSUM_TYPE(i));
-                               goto out;
-                       }
-
-                       nonce = btree_nonce(i, offset << 9);
-                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
-
-                       if (bch2_crc_cmp(csum, n_ondisk->csum)) {
-                               prt_printf(out, "invalid checksum\n");
-                               goto out;
-                       }
-
-                       bset_encrypt(c, i, offset << 9);
-
-                       sectors = vstruct_sectors(n_ondisk, c->block_bits);
-               } else {
-                       struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
-
-                       i = &bne->keys;
-
-                       if (i->seq != n_ondisk->keys.seq)
-                               break;
-
-                       if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
-                               prt_printf(out, "unknown checksum type at offset %u: %llu\n",
-                                          offset, BSET_CSUM_TYPE(i));
-                               goto out;
-                       }
-
-                       nonce = btree_nonce(i, offset << 9);
-                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
-                       if (bch2_crc_cmp(csum, bne->csum)) {
-                               prt_printf(out, "invalid checksum");
-                               goto out;
-                       }
-
-                       bset_encrypt(c, i, offset << 9);
-
-                       sectors = vstruct_sectors(bne, c->block_bits);
-               }
-
-               prt_printf(out, "  offset %u version %u, journal seq %llu\n",
-                          offset,
-                          le16_to_cpu(i->version),
-                          le64_to_cpu(i->journal_seq));
-               offset += sectors;
-
-               printbuf_indent_add(out, 4);
-
-               for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
-                       struct bkey u;
-
-                       bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
-                       prt_newline(out);
-               }
-
-               printbuf_indent_sub(out, 4);
-       }
-out:
-       if (bio)
-               bio_put(bio);
-       kvfree(n_ondisk);
-       enumerated_ref_put(&ca->io_ref[READ],
-                          BCH_DEV_READ_REF_btree_node_ondisk_to_text);
-}
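bch2_btree_node_ondisk_to_text() above walks the node as self-describing
records: each bset carries its own length, so the loop verifies a checksum,
decrypts, prints, then advances by vstruct_sectors(). A userspace sketch of
that walk, assuming a hypothetical length-prefixed record layout:

    #include <stdint.h>
    #include <stddef.h>

    struct rec { uint32_t len; /* bytes, header included */ };

    static size_t count_records(const uint8_t *buf, size_t size)
    {
            size_t off = 0, n = 0;

            while (off + sizeof(struct rec) <= size) {
                    const struct rec *r = (const void *) (buf + off);

                    if (!r->len || off + r->len > size)
                            break;          /* corrupt or end of data */
                    off += r->len;
                    n++;
            }
            return n;
    }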
-
-#ifdef CONFIG_DEBUG_FS
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
-{
-       if (i->buf.pos) {
-               size_t bytes = min_t(size_t, i->buf.pos, i->size);
-               int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
-
-               i->ret   += copied;
-               i->ubuf  += copied;
-               i->size  -= copied;
-               i->buf.pos -= copied;
-               memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
-
-               if (i->buf.last_newline >= copied)
-                       i->buf.last_newline -= copied;
-               if (i->buf.last_field >= copied)
-                       i->buf.last_field -= copied;
-
-               if (copied != bytes)
-                       return -EFAULT;
-       }
-
-       return i->size ? 0 : i->ret;
-}
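The flush helper above is shared by every debugfs read handler below: format
into the printbuf, drain what fits into the user buffer, keep the remainder,
and return nonzero once the read is satisfied. A userspace sketch of the drain
logic, with memcpy() standing in for copy_to_user() (so the partial-copy
-EFAULT path is omitted):

    #include <stddef.h>
    #include <string.h>

    struct sketch_iter {
            char    buf[256];       /* formatted output waiting to drain */
            size_t  pos;            /* bytes currently buffered */
            char    *ubuf;          /* destination buffer */
            size_t  size;           /* space left in destination */
            size_t  ret;            /* bytes copied so far */
    };

    /* 0 while the destination still has room, else total copied. */
    static size_t sketch_flush(struct sketch_iter *i)
    {
            if (i->pos) {
                    size_t bytes = i->pos < i->size ? i->pos : i->size;

                    memcpy(i->ubuf, i->buf, bytes);
                    i->ret  += bytes;
                    i->ubuf += bytes;
                    i->size -= bytes;
                    i->pos  -= bytes;
                    memmove(i->buf, i->buf + bytes, i->pos);
            }
            return i->size ? 0 : i->ret;
    }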
-
-static int bch2_dump_open(struct inode *inode, struct file *file)
-{
-       struct btree_debug *bd = inode->i_private;
-       struct dump_iter *i;
-
-       i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-       if (!i)
-               return -ENOMEM;
-
-       file->private_data = i;
-       i->from = POS_MIN;
-       i->iter = 0;
-       i->c    = container_of(bd, struct bch_fs, btree_debug[bd->id]);
-       i->id   = bd->id;
-       i->buf  = PRINTBUF;
-
-       return 0;
-}
-
-int bch2_dump_release(struct inode *inode, struct file *file)
-{
-       struct dump_iter *i = file->private_data;
-
-       printbuf_exit(&i->buf);
-       kfree(i);
-       return 0;
-}
-
-static ssize_t bch2_read_btree(struct file *file, char __user *buf,
-                              size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       return bch2_debugfs_flush_buf(i) ?:
-               bch2_trans_run(i->c,
-                       for_each_btree_key(trans, iter, i->id, i->from,
-                                          BTREE_ITER_prefetch|
-                                          BTREE_ITER_all_snapshots, k, ({
-                               bch2_bkey_val_to_text(&i->buf, i->c, k);
-                               prt_newline(&i->buf);
-                               bch2_trans_unlock(trans);
-                               i->from = bpos_successor(iter.pos);
-                               bch2_debugfs_flush_buf(i);
-                       }))) ?:
-               i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_read_btree,
-};
-
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
-                                      size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       ssize_t ret = bch2_debugfs_flush_buf(i);
-       if (ret)
-               return ret;
-
-       if (bpos_eq(SPOS_MAX, i->from))
-               return i->ret;
-
-       return bch2_trans_run(i->c,
-               for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
-                       bch2_btree_node_to_text(&i->buf, i->c, b);
-                       i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
-                               ? bpos_successor(b->key.k.p)
-                               : b->key.k.p;
-
-                       drop_locks_do(trans, bch2_debugfs_flush_buf(i));
-               }))) ?: i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_read_btree_formats,
-};
-
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
-                                      size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       return bch2_debugfs_flush_buf(i) ?:
-               bch2_trans_run(i->c,
-                       for_each_btree_key(trans, iter, i->id, i->from,
-                                          BTREE_ITER_prefetch|
-                                          BTREE_ITER_all_snapshots, k, ({
-                               struct btree_path_level *l =
-                                       &btree_iter_path(trans, &iter)->l[0];
-                               struct bkey_packed *_k =
-                                       bch2_btree_node_iter_peek(&l->iter, l->b);
-
-                               if (bpos_gt(l->b->key.k.p, i->prev_node)) {
-                                       bch2_btree_node_to_text(&i->buf, i->c, l->b);
-                                       i->prev_node = l->b->key.k.p;
-                               }
-
-                               bch2_bfloat_to_text(&i->buf, l->b, _k);
-                               bch2_trans_unlock(trans);
-                               i->from = bpos_successor(iter.pos);
-                               bch2_debugfs_flush_buf(i);
-                       }))) ?:
-               i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_read_bfloat_failed,
-};
-
-static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
-                                          struct btree *b)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 32);
-
-       prt_printf(out, "%px ", b);
-       bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
-       prt_printf(out, "\n");
-
-       printbuf_indent_add(out, 2);
-
-       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-       prt_newline(out);
-
-       prt_printf(out, "flags:\t");
-       prt_bitflags(out, bch2_btree_node_flags, b->flags);
-       prt_newline(out);
-
-       prt_printf(out, "pcpu read locks:\t%u\n",       b->c.lock.readers != NULL);
-       prt_printf(out, "written:\t%u\n",               b->written);
-       prt_printf(out, "writes blocked:\t%u\n",        !list_empty_careful(&b->write_blocked));
-       prt_printf(out, "will make reachable:\t%lx\n",  b->will_make_reachable);
-
-       prt_printf(out, "journal pin %px:\t%llu\n",
-                  &b->writes[0].journal, b->writes[0].journal.seq);
-       prt_printf(out, "journal pin %px:\t%llu\n",
-                  &b->writes[1].journal, b->writes[1].journal.seq);
-
-       prt_printf(out, "ob:\t%u\n", b->ob.nr);
-
-       printbuf_indent_sub(out, 2);
-}
-
-static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
-                                           size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-       struct bch_fs *c = i->c;
-       bool done = false;
-       ssize_t ret = 0;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       do {
-               ret = bch2_debugfs_flush_buf(i);
-               if (ret)
-                       return ret;
-
-               i->buf.atomic++;
-               scoped_guard(rcu) {
-                       struct bucket_table *tbl =
-                               rht_dereference_rcu(c->btree_cache.table.tbl,
-                                                   &c->btree_cache.table);
-                       if (i->iter < tbl->size) {
-                               struct rhash_head *pos;
-                               struct btree *b;
-
-                               rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
-                                       bch2_cached_btree_node_to_text(&i->buf, c, b);
-                               i->iter++;
-                       } else {
-                               done = true;
-                       }
-               }
-               --i->buf.atomic;
-       } while (!done);
-
-       if (i->buf.allocation_failure)
-               ret = -ENOMEM;
-
-       if (!ret)
-               ret = bch2_debugfs_flush_buf(i);
-
-       return ret ?: i->ret;
-}
-
-static const struct file_operations cached_btree_nodes_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_cached_btree_nodes_read,
-};
-
-typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
-
-static void list_sort(struct list_head *head, list_cmp_fn cmp)
-{
-       struct list_head *pos;
-
-       list_for_each(pos, head)
-               while (!list_is_last(pos, head) &&
-                      cmp(pos, pos->next) > 0) {
-                       struct list_head *pos2, *next = pos->next;
-
-                       list_del(next);
-                       list_for_each(pos2, head)
-                               if (cmp(next, pos2) < 0)
-                                       goto pos_found;
-                       BUG();
-pos_found:
-                       list_add_tail(next, pos2);
-               }
-}
-
-static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
-{
-       return cmp_int(l, r);
-}
-
-static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
-                                           size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-       struct bch_fs *c = i->c;
-       struct btree_trans *trans;
-       ssize_t ret = 0;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-restart:
-       seqmutex_lock(&c->btree_trans_lock);
-       list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
-       list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if ((ulong) trans <= i->iter)
-                       continue;
-
-               i->iter = (ulong) trans;
-
-               if (!closure_get_not_zero(&trans->ref))
-                       continue;
-
-               if (!trans->srcu_held) {
-                       closure_put(&trans->ref);
-                       continue;
-               }
-
-               u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
-               bch2_btree_trans_to_text(&i->buf, trans);
-
-               prt_printf(&i->buf, "backtrace:\n");
-               printbuf_indent_add(&i->buf, 2);
-               bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
-               printbuf_indent_sub(&i->buf, 2);
-               prt_newline(&i->buf);
-
-               closure_put(&trans->ref);
-
-               ret = bch2_debugfs_flush_buf(i);
-               if (ret)
-                       goto unlocked;
-
-               if (!seqmutex_relock(&c->btree_trans_lock, seq))
-                       goto restart;
-       }
-       seqmutex_unlock(&c->btree_trans_lock);
-unlocked:
-       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
-       if (i->buf.allocation_failure)
-               ret = -ENOMEM;
-
-       if (!ret)
-               ret = bch2_debugfs_flush_buf(i);
-
-       return ret ?: i->ret;
-}
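Sorting btree_trans_list by address is what lets the read above drop the
seqmutex mid-scan: i->iter remembers the last pointer visited, and after
relocking (or restarting) the walk skips everything at or below the cursor. A
minimal sketch of that cursor pattern over a hypothetical address-sorted list:

    #include <stddef.h>

    struct node { struct node *next; };

    /* First entry past the cursor in an address-sorted list. */
    static struct node *scan_resume(struct node *head, unsigned long cursor)
    {
            for (struct node *n = head; n; n = n->next)
                    if ((unsigned long) n > cursor)
                            return n;
            return NULL;    /* scan complete */
    }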
-
-static const struct file_operations btree_transactions_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_btree_transactions_read,
-};
-
-static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
-                                     size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-       struct bch_fs *c = i->c;
-       bool done = false;
-       int err;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       while (1) {
-               err = bch2_debugfs_flush_buf(i);
-               if (err)
-                       return err;
-
-               if (!i->size)
-                       break;
-
-               if (done)
-                       break;
-
-               done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
-               i->iter++;
-       }
-
-       if (i->buf.allocation_failure)
-               return -ENOMEM;
-
-       return i->ret;
-}
-
-static const struct file_operations journal_pins_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_journal_pins_read,
-};
-
-static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
-                                      size_t size, loff_t *ppos)
-{
-       struct dump_iter *i = file->private_data;
-       struct bch_fs *c = i->c;
-       int err;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       if (!i->iter) {
-               bch2_btree_updates_to_text(&i->buf, c);
-               i->iter++;
-       }
-
-       err = bch2_debugfs_flush_buf(i);
-       if (err)
-               return err;
-
-       if (i->buf.allocation_failure)
-               return -ENOMEM;
-
-       return i->ret;
-}
-
-static const struct file_operations btree_updates_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_btree_updates_read,
-};
-
-static int btree_transaction_stats_open(struct inode *inode, struct file *file)
-{
-       struct bch_fs *c = inode->i_private;
-       struct dump_iter *i;
-
-       i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-       if (!i)
-               return -ENOMEM;
-
-       i->iter = 1;
-       i->c    = c;
-       i->buf  = PRINTBUF;
-       file->private_data = i;
-
-       return 0;
-}
-
-static int btree_transaction_stats_release(struct inode *inode, struct file *file)
-{
-       struct dump_iter *i = file->private_data;
-
-       printbuf_exit(&i->buf);
-       kfree(i);
-
-       return 0;
-}
-
-static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
-                                           size_t size, loff_t *ppos)
-{
-       struct dump_iter        *i = file->private_data;
-       struct bch_fs *c = i->c;
-       int err;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       while (1) {
-               struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
-
-               err = bch2_debugfs_flush_buf(i);
-               if (err)
-                       return err;
-
-               if (!i->size)
-                       break;
-
-               if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
-                   !bch2_btree_transaction_fns[i->iter])
-                       break;
-
-               prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
-               printbuf_indent_add(&i->buf, 2);
-
-               mutex_lock(&s->lock);
-
-               prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-               printbuf_indent_add(&i->buf, 2);
-               bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
-               printbuf_indent_sub(&i->buf, 2);
-#endif
-
-               prt_printf(&i->buf, "Transaction duration:\n");
-
-               printbuf_indent_add(&i->buf, 2);
-               bch2_time_stats_to_text(&i->buf, &s->duration);
-               printbuf_indent_sub(&i->buf, 2);
-
-               if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
-                       prt_printf(&i->buf, "Lock hold times:\n");
-
-                       printbuf_indent_add(&i->buf, 2);
-                       bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
-                       printbuf_indent_sub(&i->buf, 2);
-               }
-
-               if (s->max_paths_text) {
-                       prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
-
-                       printbuf_indent_add(&i->buf, 2);
-                       prt_str_indented(&i->buf, s->max_paths_text);
-                       printbuf_indent_sub(&i->buf, 2);
-               }
-
-               mutex_unlock(&s->lock);
-
-               printbuf_indent_sub(&i->buf, 2);
-               prt_newline(&i->buf);
-               i->iter++;
-       }
-
-       if (i->buf.allocation_failure)
-               return -ENOMEM;
-
-       return i->ret;
-}
-
-static const struct file_operations btree_transaction_stats_op = {
-       .owner          = THIS_MODULE,
-       .open           = btree_transaction_stats_open,
-       .release        = btree_transaction_stats_release,
-       .read           = btree_transaction_stats_read,
-};
-
-/* walk btree transactions until we find a deadlock and print it */
-static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct btree_trans *trans;
-       ulong iter = 0;
-restart:
-       seqmutex_lock(&c->btree_trans_lock);
-       list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
-       list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if ((ulong) trans <= iter)
-                       continue;
-
-               iter = (ulong) trans;
-
-               if (!closure_get_not_zero(&trans->ref))
-                       continue;
-
-               u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
-               bool found = bch2_check_for_deadlock(trans, out) != 0;
-
-               closure_put(&trans->ref);
-
-               if (found)
-                       return;
-
-               if (!seqmutex_relock(&c->btree_trans_lock, seq))
-                       goto restart;
-       }
-       seqmutex_unlock(&c->btree_trans_lock);
-}
-
-typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
-
-static ssize_t bch2_simple_print(struct file *file, char __user *buf,
-                                size_t size, loff_t *ppos,
-                                fs_to_text_fn fn)
-{
-       struct dump_iter *i = file->private_data;
-       struct bch_fs *c = i->c;
-       ssize_t ret = 0;
-
-       i->ubuf = buf;
-       i->size = size;
-       i->ret  = 0;
-
-       if (!i->iter) {
-               fn(&i->buf, c);
-               i->iter++;
-       }
-
-       if (i->buf.allocation_failure)
-               ret = -ENOMEM;
-
-       if (!ret)
-               ret = bch2_debugfs_flush_buf(i);
-
-       return ret ?: i->ret;
-}
-
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
-                                       size_t size, loff_t *ppos)
-{
-       return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
-}
-
-static const struct file_operations btree_deadlock_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_btree_deadlock_read,
-};
-
-static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
-                                    size_t size, loff_t *ppos)
-{
-       return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
-}
-
-static const struct file_operations write_points_ops = {
-       .owner          = THIS_MODULE,
-       .open           = bch2_dump_open,
-       .release        = bch2_dump_release,
-       .read           = bch2_write_points_read,
-};
-
-void bch2_fs_debug_exit(struct bch_fs *c)
-{
-       if (!IS_ERR_OR_NULL(c->fs_debug_dir))
-               debugfs_remove_recursive(c->fs_debug_dir);
-}
-
-static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
-{
-       struct dentry *d;
-
-       d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
-
-       debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
-
-       debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
-
-       debugfs_create_file("bfloat-failed", 0400, d, bd,
-                           &bfloat_failed_debug_ops);
-}
-
-void bch2_fs_debug_init(struct bch_fs *c)
-{
-       struct btree_debug *bd;
-       char name[100];
-
-       if (IS_ERR_OR_NULL(bch_debug))
-               return;
-
-       if (c->sb.multi_device)
-               snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
-       else
-               strscpy(name, c->name, sizeof(name));
-
-       c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
-       if (IS_ERR_OR_NULL(c->fs_debug_dir))
-               return;
-
-       debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
-                           c->btree_debug, &cached_btree_nodes_ops);
-
-       debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
-                           c->btree_debug, &btree_transactions_ops);
-
-       debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
-                           c->btree_debug, &journal_pins_ops);
-
-       debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
-                           c->btree_debug, &btree_updates_ops);
-
-       debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
-                           c, &btree_transaction_stats_op);
-
-       debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
-                           c->btree_debug, &btree_deadlock_ops);
-
-       debugfs_create_file("write_points", 0400, c->fs_debug_dir,
-                           c->btree_debug, &write_points_ops);
-
-       bch2_fs_async_obj_debugfs_init(c);
-
-       c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
-       if (IS_ERR_OR_NULL(c->btree_debug_dir))
-               return;
-
-       for (bd = c->btree_debug;
-            bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
-            bd++) {
-               bd->id = bd - c->btree_debug;
-               bch2_fs_debug_btree_init(c, bd);
-       }
-}
-
-#endif
-
-void bch2_debug_exit(void)
-{
-       if (!IS_ERR_OR_NULL(bch_debug))
-               debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch2_debug_init(void)
-{
-       bch_debug = debugfs_create_dir("bcachefs", NULL);
-       return 0;
-}
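Module setup and teardown above follow the standard debugfs idiom: create a
top-level directory at init, remove it recursively at exit, and treat debugfs
failure as non-fatal (init still returns 0). A stand-alone sketch of that
idiom; the module and file names here are illustrative:

    #include <linux/debugfs.h>
    #include <linux/module.h>

    static struct dentry *sketch_dir;
    static u32 sketch_value = 42;

    static int __init sketch_init(void)
    {
            sketch_dir = debugfs_create_dir("sketch", NULL);
            debugfs_create_u32("value", 0400, sketch_dir, &sketch_value);
            return 0;       /* debugfs trouble is deliberately non-fatal */
    }

    static void __exit sketch_exit(void)
    {
            debugfs_remove_recursive(sketch_dir);   /* NULL/ERR_PTR safe */
    }

    module_init(sketch_init);
    module_exit(sketch_exit);
    MODULE_LICENSE("GPL");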
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
deleted file mode 100644 (file)
index d88b119..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DEBUG_H
-#define _BCACHEFS_DEBUG_H
-
-#include "bcachefs.h"
-
-struct bio;
-struct btree;
-struct bch_fs;
-
-void __bch2_btree_verify(struct bch_fs *, struct btree *);
-void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
-                                   const struct btree *);
-
-static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
-       if (static_branch_unlikely(&bch2_verify_btree_ondisk))
-               __bch2_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-struct dump_iter {
-       struct bch_fs           *c;
-       struct async_obj_list   *list;
-       enum btree_id           id;
-       struct bpos             from;
-       struct bpos             prev_node;
-       u64                     iter;
-
-       struct printbuf         buf;
-
-       char __user             *ubuf;  /* destination user buffer */
-       size_t                  size;   /* size of requested read */
-       ssize_t                 ret;    /* bytes read so far */
-};
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
-int bch2_dump_release(struct inode *, struct file *);
-
-void bch2_fs_debug_exit(struct bch_fs *);
-void bch2_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch2_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch2_debug_exit(void);
-int bch2_debug_init(void);
-
-#endif /* _BCACHEFS_DEBUG_H */
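bch2_btree_verify() above gates the expensive on-disk check behind a static
key, so the hot path costs only a patched-out branch until
bch2_verify_btree_ondisk is enabled at runtime. A sketch of the jump-label
pattern with illustrative names:

    #include <linux/jump_label.h>
    #include <linux/printk.h>

    static DEFINE_STATIC_KEY_FALSE(sketch_verify);

    static inline void maybe_verify(void)
    {
            /* compiles to a no-op branch until the key is flipped */
            if (static_branch_unlikely(&sketch_verify))
                    pr_info("running expensive verification\n");
    }

    /* flipped at runtime, e.g. from an option or sysfs handler */
    static void sketch_enable_verify(void)
    {
            static_branch_enable(&sketch_verify);
    }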
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
deleted file mode 100644 (file)
index 28875c5..0000000
+++ /dev/null
@@ -1,766 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-#include <linux/dcache.h>
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
-                 const struct qstr *str, struct qstr *out_cf)
-{
-       *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
-
-       if (!bch2_fs_casefold_enabled(trans->c))
-               return -EOPNOTSUPP;
-
-       unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
-       int ret = PTR_ERR_OR_ZERO(buf);
-       if (ret)
-               return ret;
-
-       ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
-       if (ret <= 0)
-               return ret;
-
-       *out_cf = (struct qstr) QSTR_INIT(buf, ret);
-       return 0;
-}
-#endif
-
-static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
-       if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
-               return 0;
-
-       unsigned bkey_u64s = bkey_val_u64s(d.k);
-       unsigned bkey_bytes = bkey_u64s * sizeof(u64);
-       u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
-#if CPU_BIG_ENDIAN
-       unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
-#else
-       unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
-#endif
-
-       return bkey_bytes -
-               (d.v->d_casefold
-               ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
-               : offsetof(struct bch_dirent, d_name)) -
-               trailing_nuls;
-}
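The clz/ctz trick above recovers the name length by counting NUL padding in
the value's final u64: on little-endian machines the padding lands in the
high-order bytes, hence clz; on big-endian, ctz. A userspace sketch of the
little-endian case, using the same GCC/Clang builtin:

    #include <stdint.h>

    /* "abc" + 5 NULs packs to 0x0000000000636261: clzll = 40 bits = 5 bytes. */
    static unsigned trailing_nuls(uint64_t last_u64)
    {
            return last_u64
                    ? (unsigned) __builtin_clzll(last_u64) / 8
                    : 8;    /* all eight bytes are padding */
    }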
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
-{
-       if (d.v->d_casefold) {
-               unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
-               return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
-       } else {
-               return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
-       }
-}
-
-static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
-{
-       if (d.v->d_casefold) {
-               unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
-               unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
-               return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
-       } else {
-               return (struct qstr) QSTR_INIT(NULL, 0);
-       }
-}
-
-static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
-{
-       return d.v->d_casefold
-               ? bch2_dirent_get_casefold_name(d)
-               : bch2_dirent_get_name(d);
-}
-
-static u64 bch2_dirent_hash(const struct bch_hash_info *info,
-                           const struct qstr *name)
-{
-       struct bch_str_hash_ctx ctx;
-
-       bch2_str_hash_init(&ctx, info);
-       bch2_str_hash_update(&ctx, info, name->name, name->len);
-
-       /* [0,2) reserved for dots */
-       return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
-}
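The max_t() floor above keeps dirent offsets 0 and 1 permanently unused, so
"." and ".." can never collide with a hashed name. The floor in isolation:

    #include <stdint.h>

    /* Offsets [0,2) are reserved for "." and "..". */
    static uint64_t dirent_offset(uint64_t name_hash)
    {
            return name_hash < 2 ? 2 : name_hash;
    }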
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
-       return bch2_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       struct qstr name = bch2_dirent_get_lookup_name(d);
-
-       return bch2_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
-       struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-       const struct qstr l_name = bch2_dirent_get_lookup_name(l);
-       const struct qstr *r_name = _r;
-
-       return !qstr_eq(l_name, *r_name);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
-       struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-       struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
-       const struct qstr l_name = bch2_dirent_get_lookup_name(l);
-       const struct qstr r_name = bch2_dirent_get_lookup_name(r);
-
-       return !qstr_eq(l_name, r_name);
-}
-
-static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
-{
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-       if (d.v->d_type == DT_SUBVOL)
-               return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
-       return true;
-}
-
-const struct bch_hash_desc bch2_dirent_hash_desc = {
-       .btree_id       = BTREE_ID_dirents,
-       .key_type       = KEY_TYPE_dirent,
-       .hash_key       = dirent_hash_key,
-       .hash_bkey      = dirent_hash_bkey,
-       .cmp_key        = dirent_cmp_key,
-       .cmp_bkey       = dirent_cmp_bkey,
-       .is_visible     = dirent_is_visible,
-};
-
-int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
-                        struct bkey_validate_context from)
-{
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       unsigned name_block_len = bch2_dirent_name_bytes(d);
-       struct qstr d_name = bch2_dirent_get_name(d);
-       struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
-       int ret = 0;
-
-       bkey_fsck_err_on(!d_name.len,
-                        c, dirent_empty_name,
-                        "empty name");
-
-       bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
-                        c, dirent_val_too_big,
-                        "dirent names exceed bkey size (%d + %d > %d)",
-                        d_name.len, d_cf_name.len, name_block_len);
-
-       /*
-        * Check new keys don't exceed the max length
-        * (older keys may be larger.)
-        */
-       bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
-                        c, dirent_name_too_long,
-                        "dirent name too big (%u > %u)",
-                        d_name.len, BCH_NAME_MAX);
-
-       bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
-                        c, dirent_name_embedded_nul,
-                        "dirent has stray data after name's NUL");
-
-       bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
-                        (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
-                        c, dirent_name_dot_or_dotdot,
-                        "invalid name");
-
-       bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
-                        c, dirent_name_has_slash,
-                        "name with /");
-
-       bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
-                        le64_to_cpu(d.v->d_inum) == d.k->p.inode,
-                        c, dirent_to_itself,
-                        "dirent points to own directory");
-
-       if (d.v->d_casefold) {
-               bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
-                                d_cf_name.len > BCH_NAME_MAX,
-                                c, dirent_cf_name_too_big,
-                                "dirent w/ cf name too big (%u > %u)",
-                                d_cf_name.len, BCH_NAME_MAX);
-
-               bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
-                                c, dirent_stray_data_after_cf_name,
-                                "dirent has stray data after cf name's NUL");
-       }
-fsck_err:
-       return ret;
-}
-
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       struct qstr d_name = bch2_dirent_get_name(d);
-
-       prt_printf(out, "%.*s", d_name.len, d_name.name);
-                               prt_printf(out, "invalid checksum\n");
-       if (d.v->d_casefold) {
-               struct qstr d_name = bch2_dirent_get_lookup_name(d);
-               prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
-       }
-
-       prt_str(out, " ->");
-
-       if (d.v->d_type != DT_SUBVOL)
-               prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
-       else
-               prt_printf(out, " %u -> %u",
-                          le32_to_cpu(d.v->d_parent_subvol),
-                          le32_to_cpu(d.v->d_child_subvol));
-
-       prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
-}
-
-int bch2_dirent_init_name(struct bch_fs *c,
-                         struct bkey_i_dirent *dirent,
-                         const struct bch_hash_info *hash_info,
-                         const struct qstr *name,
-                         const struct qstr *cf_name)
-{
-       EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
-       int cf_len = 0;
-
-       if (name->len > BCH_NAME_MAX)
-               return -ENAMETOOLONG;
-
-       dirent->v.d_casefold = hash_info->cf_encoding != NULL;
-
-       if (!dirent->v.d_casefold) {
-               memcpy(&dirent->v.d_name[0], name->name, name->len);
-               memset(&dirent->v.d_name[name->len], 0,
-                      bkey_val_bytes(&dirent->k) -
-                      offsetof(struct bch_dirent, d_name) -
-                      name->len);
-       } else {
-               if (!bch2_fs_casefold_enabled(c))
-                       return -EOPNOTSUPP;
-
-#ifdef CONFIG_UNICODE
-               memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
-
-               char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
-
-               if (cf_name) {
-                       cf_len = cf_name->len;
-
-                       memcpy(cf_out, cf_name->name, cf_name->len);
-               } else {
-                       cf_len = utf8_casefold(hash_info->cf_encoding, name,
-                                              cf_out,
-                                              bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
-                       if (cf_len <= 0)
-                               return cf_len;
-               }
-
-               memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
-                      bkey_val_bytes(&dirent->k) -
-                      offsetof(struct bch_dirent, d_cf_name_block.d_names) -
-                      (name->len + cf_len));
-
-               dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
-               dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
-
-               EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
-#endif
-       }
-
-       unsigned u64s = dirent_val_u64s(name->len, cf_len);
-       BUG_ON(u64s > bkey_val_u64s(&dirent->k));
-       set_bkey_val_u64s(&dirent->k, u64s);
-       return 0;
-}
-
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
-                               const struct bch_hash_info *hash_info,
-                               subvol_inum dir,
-                               u8 type,
-                               const struct qstr *name,
-                               const struct qstr *cf_name,
-                               u64 dst)
-{
-       struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
-       if (IS_ERR(dirent))
-               return dirent;
-
-       bkey_dirent_init(&dirent->k_i);
-       dirent->k.u64s = BKEY_U64s_MAX;
-
-       if (type != DT_SUBVOL) {
-               dirent->v.d_inum = cpu_to_le64(dst);
-       } else {
-               dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
-               dirent->v.d_child_subvol = cpu_to_le32(dst);
-       }
-
-       dirent->v.d_type = type;
-       dirent->v.d_unused = 0;
-
-       int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name);
-       if (ret)
-               return ERR_PTR(ret);
-
-       EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
-       return dirent;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *trans,
-                       u32 dir_subvol, u64 dir, u32 snapshot,
-                       const struct bch_hash_info *hash_info,
-                       u8 type, const struct qstr *name, u64 dst_inum,
-                       u64 *dir_offset,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
-       struct bkey_i_dirent *dirent;
-       int ret;
-
-       dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
-       ret = PTR_ERR_OR_ZERO(dirent);
-       if (ret)
-               return ret;
-
-       dirent->k.p.inode       = dir;
-       dirent->k.p.snapshot    = snapshot;
-
-       ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-                                       dir_inum, snapshot, &dirent->k_i, flags);
-       *dir_offset = dirent->k.p.offset;
-
-       return ret;
-}
-
-int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
-                      const struct bch_hash_info *hash_info,
-                      u8 type, const struct qstr *name, u64 dst_inum,
-                      u64 *dir_offset,
-                      enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_i_dirent *dirent;
-       int ret;
-
-       dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
-       ret = PTR_ERR_OR_ZERO(dirent);
-       if (ret)
-               return ret;
-
-       ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                           dir, &dirent->k_i, flags);
-       *dir_offset = dirent->k.p.offset;
-
-       return ret;
-}
-
-int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
-                           struct bkey_s_c_dirent d, subvol_inum *target)
-{
-       struct bch_subvolume s;
-       int ret = 0;
-
-       if (d.v->d_type == DT_SUBVOL &&
-           le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
-               return 1;
-
-       if (likely(d.v->d_type != DT_SUBVOL)) {
-               target->subvol  = dir.subvol;
-               target->inum    = le64_to_cpu(d.v->d_inum);
-       } else {
-               target->subvol  = le32_to_cpu(d.v->d_child_subvol);
-
-               ret = bch2_subvolume_get(trans, target->subvol, true, &s);
-
-               target->inum    = le64_to_cpu(s.inode);
-       }
-
-       return ret;
-}
-
-int bch2_dirent_rename(struct btree_trans *trans,
-               subvol_inum src_dir, struct bch_hash_info *src_hash,
-               subvol_inum dst_dir, struct bch_hash_info *dst_hash,
-               const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
-               const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
-               enum bch_rename_mode mode)
-{
-       struct qstr src_name_lookup, dst_name_lookup;
-       struct btree_iter src_iter = {};
-       struct btree_iter dst_iter = {};
-       struct bkey_s_c old_src, old_dst = bkey_s_c_null;
-       struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
-       struct bpos dst_pos =
-               POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
-       unsigned src_update_flags = 0;
-       bool delete_src, delete_dst;
-       int ret = 0;
-
-       memset(src_inum, 0, sizeof(*src_inum));
-       memset(dst_inum, 0, sizeof(*dst_inum));
-
-       /* Lookup src: */
-       ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
-       if (ret)
-               goto out;
-       old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
-                                  src_hash, src_dir, &src_name_lookup,
-                                  BTREE_ITER_intent);
-       ret = bkey_err(old_src);
-       if (ret)
-               goto out;
-
-       ret = bch2_dirent_read_target(trans, src_dir,
-                       bkey_s_c_to_dirent(old_src), src_inum);
-       if (ret)
-               goto out;
-
-       /* Lookup dst: */
-       ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
-       if (ret)
-               goto out;
-       if (mode == BCH_RENAME) {
-               /*
-                * Note that we're _not_ checking if the target already exists -
-                * we're relying on the VFS to do that check for us for
-                * correctness:
-                */
-               ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
-                                    dst_hash, dst_dir, &dst_name_lookup);
-               if (ret)
-                       goto out;
-       } else {
-               old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
-                                           dst_hash, dst_dir, &dst_name_lookup,
-                                           BTREE_ITER_intent);
-               ret = bkey_err(old_dst);
-               if (ret)
-                       goto out;
-
-               ret = bch2_dirent_read_target(trans, dst_dir,
-                               bkey_s_c_to_dirent(old_dst), dst_inum);
-               if (ret)
-                       goto out;
-       }
-
-       if (mode != BCH_RENAME_EXCHANGE)
-               *src_offset = dst_iter.pos.offset;
-
-       /* Create new dst key: */
-       new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
-                                        dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
-       ret = PTR_ERR_OR_ZERO(new_dst);
-       if (ret)
-               goto out;
-
-       dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-       new_dst->k.p = dst_iter.pos;
-
-       /* Create new src key: */
-       if (mode == BCH_RENAME_EXCHANGE) {
-               new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
-                                                src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
-               ret = PTR_ERR_OR_ZERO(new_src);
-               if (ret)
-                       goto out;
-
-               dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
-               new_src->k.p = src_iter.pos;
-       } else {
-               new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-               ret = PTR_ERR_OR_ZERO(new_src);
-               if (ret)
-                       goto out;
-
-               bkey_init(&new_src->k);
-               new_src->k.p = src_iter.pos;
-
-               if (bkey_le(dst_pos, src_iter.pos) &&
-                   bkey_lt(src_iter.pos, dst_iter.pos)) {
-                       /*
-                        * We have a hash collision for the new dst key,
-                        * and new_src - the key we're deleting - is between
-                        * new_dst's hashed slot and the slot we're going to be
-                        * inserting it into - oops.  This will break the hash
-                        * table if we don't deal with it:
-                        */
-                       if (mode == BCH_RENAME) {
-                               /*
-                                * If we're not overwriting, we can just insert
-                                * new_dst at the src position:
-                                */
-                               new_src = new_dst;
-                               new_src->k.p = src_iter.pos;
-                               goto out_set_src;
-                       } else {
-                               /* If we're overwriting, we can't insert new_dst
-                                * at a different slot because it has to
-                                * overwrite old_dst - just make sure to use a
-                                * whiteout when deleting src:
-                                */
-                               new_src->k.type = KEY_TYPE_hash_whiteout;
-                       }
-               } else {
-                       /* Check if we need a whiteout to delete src: */
-                       ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
-                                                      src_hash, &src_iter);
-                       if (ret < 0)
-                               goto out;
-
-                       if (ret)
-                               new_src->k.type = KEY_TYPE_hash_whiteout;
-               }
-       }
-
-       if (new_dst->v.d_type == DT_SUBVOL)
-               new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
-       if ((mode == BCH_RENAME_EXCHANGE) &&
-           new_src->v.d_type == DT_SUBVOL)
-               new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
-       ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
-       if (ret)
-               goto out;
-out_set_src:
-       /*
-        * If we're deleting a subvolume we need to really delete the dirent,
-        * not just emit a whiteout in the current snapshot - there can only be
-        * single dirent that points to a given subvolume.
-        *
-        * IOW, we don't maintain multiple versions in different snapshots of
-        * dirents that point to subvolumes - dirents that point to subvolumes
-        * are only visible in one particular subvolume so it's not necessary,
-        * and it would be particularly confusing for fsck to have to deal with.
-        */
-       delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
-               new_src->k.p.snapshot != old_src.k->p.snapshot;
-
-       delete_dst = old_dst.k &&
-               bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
-               new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
-       if (!delete_src || !bkey_deleted(&new_src->k)) {
-               ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-               if (ret)
-                       goto out;
-       }
-
-       if (delete_src) {
-               bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
-               ret =   bch2_btree_iter_traverse(trans, &src_iter) ?:
-                       bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
-               if (ret)
-                       goto out;
-       }
-
-       if (delete_dst) {
-               bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
-               ret =   bch2_btree_iter_traverse(trans, &dst_iter) ?:
-                       bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
-               if (ret)
-                       goto out;
-       }
-
-       if (mode == BCH_RENAME_EXCHANGE)
-               *src_offset = new_src->k.p.offset;
-       *dst_offset = new_dst->k.p.offset;
-out:
-       bch2_trans_iter_exit(trans, &src_iter);
-       bch2_trans_iter_exit(trans, &dst_iter);
-       return ret;
-}
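The collision handling in bch2_dirent_rename() exists because dirents sit in a
linearly probed hash table layered on the btree: removing a key that lies
between another key's home slot and the slot it actually landed in would break
lookups, so a hash_whiteout is written instead. A userspace sketch of why open
addressing needs tombstones (sentinel values are illustrative):

    #include <stdbool.h>

    #define EMPTY           0       /* probe chains end here */
    #define TOMBSTONE       (-1)    /* deleted: keep probing */

    /* Keys are positive ints; home is the key's hashed slot. */
    static bool probe_find(const int *table, int n, int home, int key)
    {
            for (int i = 0; i < n; i++) {
                    int slot = (home + i) % n;

                    if (table[slot] == key)
                            return true;
                    if (table[slot] == EMPTY)
                            return false;
            }
            return false;
    }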
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
-                            struct btree_iter *iter,
-                            subvol_inum dir,
-                            const struct bch_hash_info *hash_info,
-                            const struct qstr *name, subvol_inum *inum,
-                            unsigned flags)
-{
-       struct qstr lookup_name;
-       int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
-       if (ret)
-               return ret;
-
-       struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-                                            hash_info, dir, &lookup_name, flags);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
-       if (ret > 0)
-               ret = -ENOENT;
-err:
-       if (ret)
-               bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
-                      const struct bch_hash_info *hash_info,
-                      const struct qstr *name, subvol_inum *inum)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter = {};
-
-       int ret = lockrestart_do(trans,
-               bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
-                          SPOS(dir, 0, snapshot),
-                          POS(dir, U64_MAX), 0, k, ret)
-               if (k.k->type == KEY_TYPE_dirent) {
-                       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-                       if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
-                               continue;
-                       ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
-                       break;
-               }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
-{
-       u32 snapshot;
-
-       return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
-               bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
-}
-
-static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
-{
-       struct qstr name = bch2_dirent_get_name(d);
-       /*
-        * Although not required by the kernel code, updating ctx->pos is needed
-        * for the bcachefs FUSE driver. Without this update, the FUSE
-        * implementation will be stuck in an infinite loop when reading
-        * directories (via the bcachefs_fuse_readdir callback).
-        * In kernel space, ctx->pos is updated by the VFS code.
-        */
-       ctx->pos = d.k->p.offset;
-       bool ret = dir_emit(ctx, name.name,
-                     name.len,
-                     target.inum,
-                     vfs_d_type(d.v->d_type));
-       if (ret)
-               ctx->pos = d.k->p.offset + 1;
-       return !ret;
-}
-
-int bch2_readdir(struct bch_fs *c, subvol_inum inum,
-                struct bch_hash_info *hash_info,
-                struct dir_context *ctx)
-{
-       struct bkey_buf sk;
-       bch2_bkey_buf_init(&sk);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
-                                  POS(inum.inum, ctx->pos),
-                                  POS(inum.inum, U64_MAX),
-                                  inum.subvol, 0, k, ({
-                       if (k.k->type != KEY_TYPE_dirent)
-                               continue;
-
-                       /* dir_emit() can fault and block: */
-                       bch2_bkey_buf_reassemble(&sk, c, k);
-                       struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
-
-                       subvol_inum target;
-
-                       bool need_second_pass = false;
-                       int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
-                                                          hash_info, &iter, k, &need_second_pass) ?:
-                               bch2_dirent_read_target(trans, inum, dirent, &target);
-                       if (ret2 > 0)
-                               continue;
-
-                       ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
-               })));
-
-       bch2_bkey_buf_exit(&sk, c);
-
-       return ret < 0 ? ret : 0;
-}
-
-/* fsck */
-
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
-                             struct bch_inode_unpacked *inode)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
-                                    BTREE_ITER_all_snapshots, k, ret) {
-               if (k.k->p.offset != inode_nr)
-                       break;
-               if (!bkey_is_inode(k.k))
-                       continue;
-               ret = bch2_inode_unpack(k, inode);
-               goto found;
-       }
-       ret = bch_err_throw(trans->c, ENOENT_inode);
-found:
-       bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bch_inode_unpacked dir_inode;
-       struct bch_hash_info dir_hash_info;
-       int ret;
-
-       ret = lookup_first_inode(trans, pos.inode, &dir_inode);
-       if (ret)
-               goto err;
-
-       dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
-       ret =   bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                   &dir_hash_info, &iter,
-                                   BTREE_UPDATE_internal_snapshot_node);
-       bch2_trans_iter_exit(trans, &iter);
-err:
-       bch_err_fn(c, ret);
-       return ret;
-}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
deleted file mode 100644 (file)
index 0417608..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_H
-#define _BCACHEFS_DIRENT_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_dirent_hash_desc;
-
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
-                        struct bkey_validate_context);
-void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_dirent ((struct bkey_ops) {      \
-       .key_validate   = bch2_dirent_validate,         \
-       .val_to_text    = bch2_dirent_to_text,          \
-       .min_val_size   = 16,                           \
-})
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-struct bch_inode_info;
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
-                 const struct qstr *, struct qstr *);
-#else
-static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
-                               const struct qstr *str, struct qstr *out_cf)
-{
-       return -EOPNOTSUPP;
-}
-#endif
-
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
-                                     const struct bch_hash_info *info,
-                                     const struct qstr *str, struct qstr *out_cf)
-{
-       if (likely(!info->cf_encoding)) {
-               *out_cf = *str;
-               return 0;
-       } else {
-               return bch2_casefold(trans, info, str, out_cf);
-       }
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
-
-static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
-{
-       unsigned bytes = cf_len
-               ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
-               : offsetof(struct bch_dirent, d_name) + len;
-
-       return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
-                           struct bkey_s_c_dirent, subvol_inum *);
-
-static inline void dirent_copy_target(struct bkey_i_dirent *dst,
-                                     struct bkey_s_c_dirent src)
-{
-       dst->v.d_inum = src.v->d_inum;
-       dst->v.d_type = src.v->d_type;
-}
-
-int bch2_dirent_init_name(struct bch_fs *,
-                         struct bkey_i_dirent *,
-                         const struct bch_hash_info *,
-                         const struct qstr *,
-                         const struct qstr *);
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
-                               const struct bch_hash_info *, subvol_inum, u8,
-                               const struct qstr *, const struct qstr *, u64);
-
-int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
-                       const struct bch_hash_info *, u8,
-                       const struct qstr *, u64, u64 *,
-                       enum btree_iter_update_trigger_flags);
-int bch2_dirent_create(struct btree_trans *, subvol_inum,
-                      const struct bch_hash_info *, u8,
-                      const struct qstr *, u64, u64 *,
-                      enum btree_iter_update_trigger_flags);
-
-static inline unsigned vfs_d_type(unsigned type)
-{
-       return type == DT_SUBVOL ? DT_DIR : type;
-}
-
-enum bch_rename_mode {
-       BCH_RENAME,
-       BCH_RENAME_OVERWRITE,
-       BCH_RENAME_EXCHANGE,
-};
-
-int bch2_dirent_rename(struct btree_trans *,
-                      subvol_inum, struct bch_hash_info *,
-                      subvol_inum, struct bch_hash_info *,
-                      const struct qstr *, subvol_inum *, u64 *,
-                      const struct qstr *, subvol_inum *, u64 *,
-                      enum bch_rename_mode);
-
-int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
-                              subvol_inum, const struct bch_hash_info *,
-                              const struct qstr *, subvol_inum *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
-                      const struct bch_hash_info *,
-                      const struct qstr *, subvol_inum *);
-
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
-int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
-
-int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
-
-#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
deleted file mode 100644 (file)
index a46dbdd..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_FORMAT_H
-#define _BCACHEFS_DIRENT_FORMAT_H
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie POSIX requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
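As a rough, self-contained sketch of the scheme described in the comment above (a hash-indexed table probed linearly, where deletions leave whiteouts so entries later in a probe chain stay reachable), the toy code below illustrates the idea only; the names slot, SLOT_WHITEOUT and toy_lookup are hypothetical, and this is not the bcachefs btree code:

	/* Illustrative only: linear probing with whiteouts, kernel-style C. */
	enum slot_state { SLOT_EMPTY, SLOT_WHITEOUT, SLOT_USED };

	struct slot {
		enum slot_state	state;
		u64		hash;	/* plays the role of the key's offset field */
		const char	*name;
	};

	static const struct slot *toy_lookup(const struct slot *tbl, unsigned nr,
					     u64 hash, const char *name)
	{
		for (unsigned i = 0; i < nr; i++) {
			const struct slot *s = &tbl[(hash + i) % nr];

			if (s->state == SLOT_EMPTY)
				return NULL;	/* end of probe chain */
			/* a whiteout keeps the chain alive past a deletion */
			if (s->state == SLOT_USED &&
			    s->hash == hash && !strcmp(s->name, name))
				return s;
		}
		return NULL;
	}

Deleting an entry sets SLOT_WHITEOUT rather than SLOT_EMPTY; if it were set empty, lookups would stop probing early and miss entries placed further along the chain, which is exactly why the comment says whiteouts are required.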
-
-struct bch_dirent {
-       struct bch_val          v;
-
-       /* Target inode number: */
-       union {
-       __le64                  d_inum;
-       struct {                /* DT_SUBVOL */
-       __le32                  d_child_subvol;
-       __le32                  d_parent_subvol;
-       };
-       };
-
-       /*
-        * Copy of mode bits 12-15 from the target inode - so userspace can get
-        * the filetype without having to do a stat()
-        */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u8                    d_type:5,
-                               d_unused:2,
-                               d_casefold:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-       __u8                    d_casefold:1,
-                               d_unused:2,
-                               d_type:5;
-#endif
-
-       union {
-       struct {
-               __u8            d_pad;
-               __le16          d_name_len;
-               __le16          d_cf_name_len;
-               __u8            d_names[];
-       } d_cf_name_block __packed;
-       __DECLARE_FLEX_ARRAY(__u8, d_name);
-       } __packed;
-} __packed __aligned(8);
-
-#define DT_SUBVOL      16
-#define BCH_DT_MAX     17
-
-#define BCH_NAME_MAX   512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
deleted file mode 100644 (file)
index f7528cd..0000000
+++ /dev/null
@@ -1,1074 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-
-/*
- * Notes on disk accounting:
- *
- * We have two parallel sets of counters to be concerned with, and both must be
- * kept in sync.
- *
- *  - Persistent/on disk accounting, stored in the accounting btree and updated
- *    via btree write buffer updates that treat new accounting keys as deltas to
- *    apply to existing values. But reading from a write buffer btree is
- *    expensive, so we also have
- *
- *  - In memory accounting, where accounting is stored as an array of percpu
- *    counters, indexed by an eytzinger array of disk accounting keys/bpos (which
- *    are the same thing, excepting byte swabbing on big endian).
- *
- *    Cheap to read, but non persistent.
- *
- * Disk accounting updates are generated by transactional triggers; these run as
- * keys enter and leave the btree, and can compare old and new versions of keys;
- * the output of these triggers are deltas to the various counters.
- *
- * Disk accounting updates are done as btree write buffer updates, where the
- * counters in the disk accounting key are deltas that will be applied to the
- * counter in the btree when the key is flushed by the write buffer (or journal
- * replay).
- *
- * To do a disk accounting update:
- * - initialize a disk_accounting_pos, to specify which counter is being updated
- * - initialize counter deltas, as an array of 1-3 s64s
- * - call bch2_disk_accounting_mod()
- *
- * This queues up the accounting update to be done at transaction commit time.
- * Underneath, it's a normal btree write buffer update.
- *
- * The transaction commit path is responsible for propagating updates to the in
- * memory counters, with bch2_accounting_mem_mod().
- *
- * The commit path also assigns every disk accounting update a unique version
- * number, based on the journal sequence number and offset within that journal
- * buffer; this is used by journal replay to determine which updates have been
- * done.
- *
- * The transaction commit path also ensures that replicas entry accounting
- * updates are properly marked in the superblock (so that we know whether we can
- * mount without data being unavailable); it will update the superblock if
- * bch2_accounting_mem_mod() tells it to.
- */
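A minimal sketch of the three-step update sequence listed above, patterned on bch2_mod_dev_cached_sectors() later in this file; the choice of the nr_inodes counter and the +1 delta are illustrative:

	static int example_inc_nr_inodes(struct btree_trans *trans)
	{
		struct disk_accounting_pos acc;
		s64 d[1] = { 1 };	/* counter deltas: an array of 1-3 s64s */

		/* step 1: say which counter is being updated */
		memset(&acc, 0, sizeof(acc));
		acc.type = BCH_DISK_ACCOUNTING_nr_inodes;

		/* steps 2-3: queue the deltas to be applied at commit time */
		return bch2_disk_accounting_mod(trans, &acc, d, 1, false);
	}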
-
-static const char * const disk_accounting_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
-       BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-       NULL
-};
-
-static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
-                                        s64 *d, unsigned nr)
-{
-       struct bkey_i_accounting *acc = bkey_accounting_init(k);
-
-       acc->k.p = pos;
-       set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
-
-       memcpy_u64s_small(acc->v.d, d, nr);
-}
-
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
-                                      s64 *d, unsigned nr)
-{
-       return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
-
-int bch2_disk_accounting_mod(struct btree_trans *trans,
-                            struct disk_accounting_pos *k,
-                            s64 *d, unsigned nr, bool gc)
-{
-       BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
-
-       /* Normalize: */
-       switch (k->type) {
-       case BCH_DISK_ACCOUNTING_replicas:
-               bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
-               break;
-       }
-
-       struct bpos pos = disk_accounting_pos_to_bpos(k);
-
-       if (likely(!gc)) {
-               struct bkey_i_accounting *a;
-#if 0
-               for (a = btree_trans_subbuf_base(trans, &trans->accounting);
-                    a != btree_trans_subbuf_top(trans, &trans->accounting);
-                    a = (void *) bkey_next(&a->k_i))
-                       if (bpos_eq(a->k.p, pos)) {
-                               BUG_ON(nr != bch2_accounting_counters(&a->k));
-                               acc_u64s(a->v.d, d, nr);
-
-                               if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
-                                       unsigned offset = (u64 *) a -
-                                               (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
-
-                                       trans->accounting.u64s -= a->k.u64s;
-                                       memmove_u64s_down(a,
-                                                         bkey_next(&a->k_i),
-                                                         trans->accounting.u64s - offset);
-                               }
-                               return 0;
-                       }
-#endif
-               unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
-               a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
-               int ret = PTR_ERR_OR_ZERO(a);
-               if (ret)
-                       return ret;
-
-               __accounting_key_init(&a->k_i, pos, d, nr);
-               return 0;
-       } else {
-               struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
-               __accounting_key_init(&k_i.k, pos, d, nr);
-
-               int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
-               if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
-                       ret = drop_locks_do(trans,
-                               bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
-                               bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
-               return ret;
-       }
-}
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
-                               unsigned dev, s64 sectors,
-                               bool gc)
-{
-       struct disk_accounting_pos acc;
-       memset(&acc, 0, sizeof(acc));
-       acc.type = BCH_DISK_ACCOUNTING_replicas;
-       bch2_replicas_entry_cached(&acc.replicas, dev);
-
-       return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-}
-
-static inline bool is_zero(char *start, char *end)
-{
-       BUG_ON(start > end);
-
-       for (; start < end; start++)
-               if (*start)
-                       return false;
-       return true;
-}
-
-#define field_end(p, member)   (((void *) (&p.member)) + sizeof(p.member))
-
-static const unsigned bch2_accounting_type_nr_counters[] = {
-#define x(f, id, nr)   [BCH_DISK_ACCOUNTING_##f]       = nr,
-       BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-};
-
-int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
-                            struct bkey_validate_context from)
-{
-       struct disk_accounting_pos acc_k;
-       bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-       void *end = &acc_k + 1;
-       int ret = 0;
-
-       bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
-                        bversion_zero(k.k->bversion),
-                        c, accounting_key_version_0,
-                        "accounting key with version=0");
-
-       switch (acc_k.type) {
-       case BCH_DISK_ACCOUNTING_nr_inodes:
-               end = field_end(acc_k, nr_inodes);
-               break;
-       case BCH_DISK_ACCOUNTING_persistent_reserved:
-               end = field_end(acc_k, persistent_reserved);
-               break;
-       case BCH_DISK_ACCOUNTING_replicas:
-               bkey_fsck_err_on(!acc_k.replicas.nr_devs,
-                                c, accounting_key_replicas_nr_devs_0,
-                                "accounting key replicas entry with nr_devs=0");
-
-               bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
-                                (acc_k.replicas.nr_required > 1 &&
-                                 acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
-                                c, accounting_key_replicas_nr_required_bad,
-                                "accounting key replicas entry with bad nr_required");
-
-               for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
-                       bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
-                                        c, accounting_key_replicas_devs_unsorted,
-                                        "accounting key replicas entry with unsorted devs");
-
-               end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
-               break;
-       case BCH_DISK_ACCOUNTING_dev_data_type:
-               end = field_end(acc_k, dev_data_type);
-               break;
-       case BCH_DISK_ACCOUNTING_compression:
-               end = field_end(acc_k, compression);
-               break;
-       case BCH_DISK_ACCOUNTING_snapshot:
-               end = field_end(acc_k, snapshot);
-               break;
-       case BCH_DISK_ACCOUNTING_btree:
-               end = field_end(acc_k, btree);
-               break;
-       case BCH_DISK_ACCOUNTING_rebalance_work:
-               end = field_end(acc_k, rebalance_work);
-               break;
-       }
-
-       bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
-                        c, accounting_key_junk_at_end,
-                        "junk at end of accounting key");
-
-       bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
-                        c, accounting_key_nr_counters_wrong,
-                        "accounting key with %u counters, should be %u",
-                        bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
-fsck_err:
-       return ret;
-}
-
-void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
-{
-       if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
-               prt_printf(out, "unknown type %u", k->type);
-               return;
-       }
-
-       prt_str(out, disk_accounting_type_strs[k->type]);
-       prt_str(out, " ");
-
-       switch (k->type) {
-       case BCH_DISK_ACCOUNTING_nr_inodes:
-               break;
-       case BCH_DISK_ACCOUNTING_persistent_reserved:
-               prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
-               break;
-       case BCH_DISK_ACCOUNTING_replicas:
-               bch2_replicas_entry_to_text(out, &k->replicas);
-               break;
-       case BCH_DISK_ACCOUNTING_dev_data_type:
-               prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
-               bch2_prt_data_type(out, k->dev_data_type.data_type);
-               break;
-       case BCH_DISK_ACCOUNTING_compression:
-               bch2_prt_compression_type(out, k->compression.type);
-               break;
-       case BCH_DISK_ACCOUNTING_snapshot:
-               prt_printf(out, "id=%u", k->snapshot.id);
-               break;
-       case BCH_DISK_ACCOUNTING_btree:
-               prt_str(out, "btree=");
-               bch2_btree_id_to_text(out, k->btree.id);
-               break;
-       }
-}
-
-void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
-       struct disk_accounting_pos acc_k;
-       bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-       bch2_accounting_key_to_text(out, &acc_k);
-
-       for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
-               prt_printf(out, " %lli", acc.v->d[i]);
-}
-
-void bch2_accounting_swab(struct bkey_s k)
-{
-       for (u64 *p = (u64 *) k.v;
-            p < (u64 *) bkey_val_end(k);
-            p++)
-               *p = swab64(*p);
-}
-
-static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
-                                           struct disk_accounting_pos *acc)
-{
-       unsafe_memcpy(r, &acc->replicas,
-                     replicas_entry_bytes(&acc->replicas),
-                     "variable length struct");
-}
-
-static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
-{
-       struct disk_accounting_pos acc_k;
-       bpos_to_disk_accounting_pos(&acc_k, p);
-
-       switch (acc_k.type) {
-       case BCH_DISK_ACCOUNTING_replicas:
-               __accounting_to_replicas(r, &acc_k);
-               return true;
-       default:
-               return false;
-       }
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
-{
-       union bch_replicas_padded r;
-       return accounting_to_replicas(&r.e, p)
-               ? bch2_mark_replicas(c, &r.e)
-               : 0;
-}
-
-/*
- * Ensure accounting keys being updated are present in the superblock, when
- * applicable (i.e. replicas updates)
- */
-int bch2_accounting_update_sb(struct btree_trans *trans)
-{
-       for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
-            i != btree_trans_subbuf_top(trans, &trans->accounting);
-            i = bkey_next(i)) {
-               int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-
-       /* raced with another insert, already present: */
-       if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                           accounting_pos_cmp, &a.k->p) < acc->k.nr)
-               return 0;
-
-       struct accounting_mem_entry n = {
-               .pos            = a.k->p,
-               .bversion       = a.k->bversion,
-               .nr_counters    = bch2_accounting_counters(a.k),
-               .v[0]           = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
-                                                    sizeof(u64), GFP_KERNEL),
-       };
-
-       if (!n.v[0])
-               goto err;
-
-       if (acc->gc_running) {
-               n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
-                                           sizeof(u64), GFP_KERNEL);
-               if (!n.v[1])
-                       goto err;
-       }
-
-       if (darray_push(&acc->k, n))
-               goto err;
-
-       eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                       accounting_pos_cmp, NULL);
-
-       if (trace_accounting_mem_insert_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_accounting_to_text(&buf, c, a.s_c);
-               trace_accounting_mem_insert(c, buf.buf);
-               printbuf_exit(&buf);
-       }
-       return 0;
-err:
-       free_percpu(n.v[1]);
-       free_percpu(n.v[0]);
-       return bch_err_throw(c, ENOMEM_disk_accounting);
-}
-
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
-                              enum bch_accounting_mode mode)
-{
-       union bch_replicas_padded r;
-
-       if (mode != BCH_ACCOUNTING_read &&
-           accounting_to_replicas(&r.e, a.k->p) &&
-           !bch2_replicas_marked_locked(c, &r.e))
-               return bch_err_throw(c, btree_insert_need_mark_replicas);
-
-       percpu_up_read(&c->mark_lock);
-       percpu_down_write(&c->mark_lock);
-       int ret = __bch2_accounting_mem_insert(c, a);
-       percpu_up_write(&c->mark_lock);
-       percpu_down_read(&c->mark_lock);
-       return ret;
-}
-
-int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
-                              enum bch_accounting_mode mode)
-{
-       union bch_replicas_padded r;
-
-       if (mode != BCH_ACCOUNTING_read &&
-           accounting_to_replicas(&r.e, a.k->p) &&
-           !bch2_replicas_marked_locked(c, &r.e))
-               return bch_err_throw(c, btree_insert_need_mark_replicas);
-
-       return __bch2_accounting_mem_insert(c, a);
-}
-
-static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
-{
-       for (unsigned i = 0; i < e->nr_counters; i++)
-               if (percpu_u64_get(e->v[0] + i) ||
-                   (e->v[1] &&
-                    percpu_u64_get(e->v[1] + i)))
-                       return false;
-       return true;
-}
-
-void bch2_accounting_mem_gc(struct bch_fs *c)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-
-       percpu_down_write(&c->mark_lock);
-       struct accounting_mem_entry *dst = acc->k.data;
-
-       darray_for_each(acc->k, src) {
-               if (accounting_mem_entry_is_zero(src)) {
-                       free_percpu(src->v[0]);
-                       free_percpu(src->v[1]);
-               } else {
-                       *dst++ = *src;
-               }
-       }
-
-       acc->k.nr = dst - acc->k.data;
-       eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                       accounting_pos_cmp, NULL);
-       percpu_up_write(&c->mark_lock);
-}
-
-/*
- * Read out accounting keys for replicas entries, as an array of
- * bch_replicas_usage entries.
- *
- * Note: this may be deprecated/removed at some point in the future and replaced
- * with something more general; it exists to support the ioctl used by the
- * 'bcachefs fs usage' command.
- */
-int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-       int ret = 0;
-
-       darray_init(usage);
-
-       percpu_down_read(&c->mark_lock);
-       darray_for_each(acc->k, i) {
-               union {
-                       u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
-                                              BCH_BKEY_PTRS_MAX)];
-                       struct bch_replicas_usage r;
-               } u;
-               u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
-
-               if (!accounting_to_replicas(&u.r.r, i->pos))
-                       continue;
-
-               u64 sectors;
-               bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
-               u.r.sectors = sectors;
-
-               ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
-               if (ret)
-                       break;
-
-               memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
-               usage->nr += replicas_usage_bytes(&u.r);
-       }
-       percpu_up_read(&c->mark_lock);
-
-       if (ret)
-               darray_exit(usage);
-       return ret;
-}
-
-int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
-{
-
-       struct bch_accounting_mem *acc = &c->accounting;
-       int ret = 0;
-
-       darray_init(out_buf);
-
-       percpu_down_read(&c->mark_lock);
-       darray_for_each(acc->k, i) {
-               struct disk_accounting_pos a_p;
-               bpos_to_disk_accounting_pos(&a_p, i->pos);
-
-               if (!(accounting_types_mask & BIT(a_p.type)))
-                       continue;
-
-               ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
-                                      sizeof(u64) * i->nr_counters);
-               if (ret)
-                       break;
-
-               struct bkey_i_accounting *a_out =
-                       bkey_accounting_init((void *) &darray_top(*out_buf));
-               set_bkey_val_u64s(&a_out->k, i->nr_counters);
-               a_out->k.p = i->pos;
-               bch2_accounting_mem_read_counters(acc, i - acc->k.data,
-                                                 a_out->v.d, i->nr_counters, false);
-
-               if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
-                       out_buf->nr += bkey_bytes(&a_out->k);
-       }
-
-       percpu_up_read(&c->mark_lock);
-
-       if (ret)
-               darray_exit(out_buf);
-       return ret;
-}
-
-static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
-{
-       darray_for_each(acc->k, e) {
-               free_percpu(e->v[gc]);
-               e->v[gc] = NULL;
-       }
-}
-
-int bch2_gc_accounting_start(struct bch_fs *c)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-       int ret = 0;
-
-       percpu_down_write(&c->mark_lock);
-       darray_for_each(acc->k, e) {
-               e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
-                                            sizeof(u64), GFP_KERNEL);
-               if (!e->v[1]) {
-                       bch2_accounting_free_counters(acc, true);
-                       ret = bch_err_throw(c, ENOMEM_disk_accounting);
-                       break;
-               }
-       }
-
-       acc->gc_running = !ret;
-       percpu_up_write(&c->mark_lock);
-
-       return ret;
-}
-
-int bch2_gc_accounting_done(struct bch_fs *c)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct printbuf buf = PRINTBUF;
-       struct bpos pos = POS_MIN;
-       int ret = 0;
-
-       percpu_down_write(&c->mark_lock);
-       while (1) {
-               unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                                                 accounting_pos_cmp, &pos);
-
-               if (idx >= acc->k.nr)
-                       break;
-
-               struct accounting_mem_entry *e = acc->k.data + idx;
-               pos = bpos_successor(e->pos);
-
-               struct disk_accounting_pos acc_k;
-               bpos_to_disk_accounting_pos(&acc_k, e->pos);
-
-               if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-                       continue;
-
-               u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
-               u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
-
-               unsigned nr = e->nr_counters;
-               bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
-               bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
-
-               if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
-                       printbuf_reset(&buf);
-                       prt_str(&buf, "accounting mismatch for ");
-                       bch2_accounting_key_to_text(&buf, &acc_k);
-
-                       prt_str(&buf, ":\n      got");
-                       for (unsigned j = 0; j < nr; j++)
-                               prt_printf(&buf, " %llu", dst_v[j]);
-
-                       prt_str(&buf,  "\nshould be");
-                       for (unsigned j = 0; j < nr; j++)
-                               prt_printf(&buf, " %llu", src_v[j]);
-
-                       for (unsigned j = 0; j < nr; j++)
-                               src_v[j] -= dst_v[j];
-
-                       bch2_trans_unlock_long(trans);
-
-                       if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
-                               percpu_up_write(&c->mark_lock);
-                               ret = commit_do(trans, NULL, NULL, 0,
-                                               bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
-                               percpu_down_write(&c->mark_lock);
-                               if (ret)
-                                       goto err;
-
-                               if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
-                                       memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-                                       struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
-                                       accounting_key_init(&k_i.k, &acc_k, src_v, nr);
-                                       bch2_accounting_mem_mod_locked(trans,
-                                                               bkey_i_to_s_c_accounting(&k_i.k),
-                                                               BCH_ACCOUNTING_normal, true);
-
-                                       preempt_disable();
-                                       struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
-                                       struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-                                       acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
-                                       preempt_enable();
-                               }
-                       }
-               }
-       }
-err:
-fsck_err:
-       percpu_up_write(&c->mark_lock);
-       printbuf_exit(&buf);
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-
-       if (k.k->type != KEY_TYPE_accounting)
-               return 0;
-
-       percpu_down_read(&c->mark_lock);
-       int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
-                                                BCH_ACCOUNTING_read, false);
-       percpu_up_read(&c->mark_lock);
-       return ret;
-}
-
-static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
-                                             struct disk_accounting_pos *acc,
-                                             u64 *v, unsigned nr)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0, invalid_dev = -1;
-
-       switch (acc->type) {
-       case BCH_DISK_ACCOUNTING_replicas: {
-               union bch_replicas_padded r;
-               __accounting_to_replicas(&r.e, acc);
-
-               for (unsigned i = 0; i < r.e.nr_devs; i++)
-                       if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
-                           !bch2_dev_exists(c, r.e.devs[i])) {
-                               invalid_dev = r.e.devs[i];
-                               goto invalid_device;
-                       }
-
-               /*
-                * All replicas entry checks except for invalid device are done
-                * in bch2_accounting_validate
-                */
-               BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
-
-               if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
-                               trans, accounting_replicas_not_marked,
-                               "accounting not marked in superblock replicas\n%s",
-                               (printbuf_reset(&buf),
-                                bch2_accounting_key_to_text(&buf, acc),
-                                buf.buf))) {
-                       /*
-                        * We're not RW yet and still single threaded, so dropping
-                        * and retaking the lock is OK:
-                        */
-                       percpu_up_write(&c->mark_lock);
-                       ret = bch2_mark_replicas(c, &r.e);
-                       if (ret)
-                               goto fsck_err;
-                       percpu_down_write(&c->mark_lock);
-               }
-               break;
-       }
-
-       case BCH_DISK_ACCOUNTING_dev_data_type:
-               if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
-                       invalid_dev = acc->dev_data_type.dev;
-                       goto invalid_device;
-               }
-               break;
-       }
-
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-invalid_device:
-       if (fsck_err(trans, accounting_to_invalid_device,
-                    "accounting entry points to invalid device %i\n%s",
-                    invalid_dev,
-                    (printbuf_reset(&buf),
-                     bch2_accounting_key_to_text(&buf, acc),
-                     buf.buf))) {
-               for (unsigned i = 0; i < nr; i++)
-                       v[i] = -v[i];
-
-               ret = commit_do(trans, NULL, NULL, 0,
-                               bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
-                       -BCH_ERR_remove_disk_accounting_entry;
-       } else {
-               ret = bch_err_throw(c, remove_disk_accounting_entry);
-       }
-       goto fsck_err;
-}
-
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct printbuf buf = PRINTBUF;
-
-       /*
-        * We might run more than once if we rewind to start topology repair or
-        * btree node scan - and those might cause us to get different results,
-        * so we can't just skip if we've already run.
-        *
-        * Instead, zero out any accounting we have:
-        */
-       percpu_down_write(&c->mark_lock);
-       darray_for_each(acc->k, e)
-               percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
-       for_each_member_device(c, ca)
-               percpu_memset(ca->usage, 0, sizeof(*ca->usage));
-       percpu_memset(c->usage, 0, sizeof(*c->usage));
-       percpu_up_write(&c->mark_lock);
-
-       struct btree_iter iter;
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
-                            BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-       iter.flags &= ~BTREE_ITER_with_journal;
-       int ret = for_each_btree_key_continue(trans, iter,
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-                       struct bkey u;
-                       struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
-                       if (k.k->type != KEY_TYPE_accounting)
-                               continue;
-
-                       struct disk_accounting_pos acc_k;
-                       bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-                       if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-                               break;
-
-                       if (!bch2_accounting_is_mem(&acc_k)) {
-                               struct disk_accounting_pos next;
-                               memset(&next, 0, sizeof(next));
-                               next.type = acc_k.type + 1;
-                               bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
-                               continue;
-                       }
-
-                       accounting_read_key(trans, k);
-               }));
-       if (ret)
-               goto err;
-
-       struct journal_keys *keys = &c->journal_keys;
-       struct journal_key *dst = keys->data;
-       move_gap(keys, keys->nr);
-
-       darray_for_each(*keys, i) {
-               if (i->k->k.type == KEY_TYPE_accounting) {
-                       struct disk_accounting_pos acc_k;
-                       bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
-
-                       if (!bch2_accounting_is_mem(&acc_k))
-                               continue;
-
-                       struct bkey_s_c k = bkey_i_to_s_c(i->k);
-                       unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
-                                               sizeof(acc->k.data[0]),
-                                               accounting_pos_cmp, &k.k->p);
-
-                       bool applied = idx < acc->k.nr &&
-                               bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
-
-                       if (applied)
-                               continue;
-
-                       if (i + 1 < &darray_top(*keys) &&
-                           i[1].k->k.type == KEY_TYPE_accounting &&
-                           !journal_key_cmp(i, i + 1)) {
-                               WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
-
-                               i[1].journal_seq = i[0].journal_seq;
-
-                               bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
-                                                          bkey_s_c_to_accounting(k));
-                               continue;
-                       }
-
-                       ret = accounting_read_key(trans, k);
-                       if (ret)
-                               goto err;
-               }
-
-               *dst++ = *i;
-       }
-       keys->gap = keys->nr = dst - keys->data;
-
-       percpu_down_write(&c->mark_lock);
-
-       darray_for_each_reverse(acc->k, i) {
-               struct disk_accounting_pos acc_k;
-               bpos_to_disk_accounting_pos(&acc_k, i->pos);
-
-               u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-               memset(v, 0, sizeof(v));
-
-               for (unsigned j = 0; j < i->nr_counters; j++)
-                       v[j] = percpu_u64_get(i->v[0] + j);
-
-               /*
-                * If the entry counters are zeroed, it should be treated as
-                * nonexistent - it might point to an invalid device.
-                *
-                * Remove it, so that if it's re-added it gets re-marked in the
-                * superblock:
-                */
-               ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
-                       ? -BCH_ERR_remove_disk_accounting_entry
-                       : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
-
-               if (ret == -BCH_ERR_remove_disk_accounting_entry) {
-                       free_percpu(i->v[0]);
-                       free_percpu(i->v[1]);
-                       darray_remove_item(&acc->k, i);
-                       ret = 0;
-                       continue;
-               }
-
-               if (ret)
-                       goto fsck_err;
-       }
-
-       eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                       accounting_pos_cmp, NULL);
-
-       preempt_disable();
-       struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
-
-       for (unsigned i = 0; i < acc->k.nr; i++) {
-               struct disk_accounting_pos k;
-               bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
-               u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-               bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
-               switch (k.type) {
-               case BCH_DISK_ACCOUNTING_persistent_reserved:
-                       usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
-                       break;
-               case BCH_DISK_ACCOUNTING_replicas:
-                       fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
-                       break;
-               case BCH_DISK_ACCOUNTING_dev_data_type: {
-                       guard(rcu)();
-                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
-                       if (ca) {
-                               struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
-                               percpu_u64_set(&d->buckets,     v[0]);
-                               percpu_u64_set(&d->sectors,     v[1]);
-                               percpu_u64_set(&d->fragmented,  v[2]);
-
-                               if (k.dev_data_type.data_type == BCH_DATA_sb ||
-                                   k.dev_data_type.data_type == BCH_DATA_journal)
-                                       usage->hidden += v[0] * ca->mi.bucket_size;
-                       }
-                       break;
-               }
-               }
-       }
-       preempt_enable();
-fsck_err:
-       percpu_up_write(&c->mark_lock);
-err:
-       printbuf_exit(&buf);
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
-{
-       return bch2_trans_run(c,
-               bch2_btree_write_buffer_flush_sync(trans) ?:
-               for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
-                               BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
-                       struct disk_accounting_pos acc;
-                       bpos_to_disk_accounting_pos(&acc, k.k->p);
-
-                       acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
-                       acc.dev_data_type.dev == dev
-                               ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
-                               : 0;
-               })) ?:
-               bch2_btree_write_buffer_flush_sync(trans));
-}
-
-int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
-{
-       struct bch_fs *c = ca->fs;
-       u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
-
-       int ret = bch2_trans_do(c, ({
-               bch2_disk_accounting_mod2(trans, gc,
-                                         v, dev_data_type,
-                                         .dev = ca->dev_idx,
-                                         .data_type = BCH_DATA_free) ?:
-               (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
-       }));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-void bch2_verify_accounting_clean(struct bch_fs *c)
-{
-       bool mismatch = false;
-       struct bch_fs_usage_base base = {}, base_inmem = {};
-
-       bch2_trans_run(c,
-               for_each_btree_key(trans, iter,
-                                  BTREE_ID_accounting, POS_MIN,
-                                  BTREE_ITER_all_snapshots, k, ({
-                       u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-                       struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
-                       unsigned nr = bch2_accounting_counters(k.k);
-
-                       struct disk_accounting_pos acc_k;
-                       bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-                       if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-                               break;
-
-                       if (!bch2_accounting_is_mem(&acc_k)) {
-                               struct disk_accounting_pos next;
-                               memset(&next, 0, sizeof(next));
-                               next.type = acc_k.type + 1;
-                               bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
-                               continue;
-                       }
-
-                       bch2_accounting_mem_read(c, k.k->p, v, nr);
-
-                       if (memcmp(a.v->d, v, nr * sizeof(u64))) {
-                               struct printbuf buf = PRINTBUF;
-
-                               bch2_bkey_val_to_text(&buf, c, k);
-                               prt_str(&buf, " !=");
-                               for (unsigned j = 0; j < nr; j++)
-                                       prt_printf(&buf, " %llu", v[j]);
-
-                               pr_err("%s", buf.buf);
-                               printbuf_exit(&buf);
-                               mismatch = true;
-                       }
-
-                       switch (acc_k.type) {
-                       case BCH_DISK_ACCOUNTING_persistent_reserved:
-                               base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
-                               break;
-                       case BCH_DISK_ACCOUNTING_replicas:
-                               fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
-                               break;
-                       case BCH_DISK_ACCOUNTING_dev_data_type:
-                               {
-                                       guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
-                                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
-                                       if (!ca)
-                                               continue;
-
-                                       v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
-                                       v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
-                                       v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
-                               }
-
-                               if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
-                                       struct printbuf buf = PRINTBUF;
-
-                                       bch2_bkey_val_to_text(&buf, c, k);
-                                       prt_str(&buf, " in mem");
-                                       for (unsigned j = 0; j < nr; j++)
-                                               prt_printf(&buf, " %llu", v[j]);
-
-                                       pr_err("dev accounting mismatch: %s", buf.buf);
-                                       printbuf_exit(&buf);
-                                       mismatch = true;
-                               }
-                       }
-
-                       0;
-               })));
-
-       acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
-
-#define check(x)                                                                               \
-       if (base.x != base_inmem.x) {                                                           \
-               pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x);    \
-               mismatch = true;                                                                \
-       }
-
-       //check(hidden);
-       check(btree);
-       check(data);
-       check(cached);
-       check(reserved);
-       check(nr_inodes);
-
-       WARN_ON(mismatch);
-}
-
-void bch2_accounting_gc_free(struct bch_fs *c)
-{
-       lockdep_assert_held(&c->mark_lock);
-
-       struct bch_accounting_mem *acc = &c->accounting;
-
-       bch2_accounting_free_counters(acc, true);
-       acc->gc_running = false;
-}
-
-void bch2_fs_accounting_exit(struct bch_fs *c)
-{
-       struct bch_accounting_mem *acc = &c->accounting;
-
-       bch2_accounting_free_counters(acc, false);
-       darray_exit(&acc->k);
-}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
deleted file mode 100644 (file)
index d61abeb..0000000
+++ /dev/null
@@ -1,301 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_H
-#define _BCACHEFS_DISK_ACCOUNTING_H
-
-#include "btree_update.h"
-#include "eytzinger.h"
-#include "sb-members.h"
-
-static inline void bch2_u64s_neg(u64 *v, unsigned nr)
-{
-       for (unsigned i = 0; i < nr; i++)
-               v[i] = -v[i];
-}
-
-static inline unsigned bch2_accounting_counters(const struct bkey *k)
-{
-       return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
-}
-
-static inline void bch2_accounting_neg(struct bkey_s_accounting a)
-{
-       bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
-}
-
-static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
-{
-       for (unsigned i = 0;  i < bch2_accounting_counters(a.k); i++)
-               if (a.v->d[i])
-                       return false;
-       return true;
-}
-
-static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
-                                             struct bkey_s_c_accounting src)
-{
-       for (unsigned i = 0;
-            i < min(bch2_accounting_counters(&dst->k),
-                    bch2_accounting_counters(src.k));
-            i++)
-               dst->v.d[i] += src.v->d[i];
-
-       if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
-               dst->k.bversion = src.k->bversion;
-}
-
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
-                                             enum bch_data_type data_type,
-                                             s64 sectors)
-{
-       switch (data_type) {
-       case BCH_DATA_btree:
-               fs_usage->btree         += sectors;
-               break;
-       case BCH_DATA_user:
-       case BCH_DATA_parity:
-               fs_usage->data          += sectors;
-               break;
-       case BCH_DATA_cached:
-               fs_usage->cached        += sectors;
-               break;
-       default:
-               break;
-       }
-}
-
-static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
-{
-       BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-       acc->_pad = p;
-#else
-       memcpy_swab(acc, &p, sizeof(p));
-#endif
-}
-
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
-{
-       struct bpos p;
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-       p = acc->_pad;
-#else
-       memcpy_swab(&p, acc, sizeof(p));
-#endif
-       return p;
-}
-
-int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
-                            s64 *, unsigned, bool);
-
-#define disk_accounting_key_init(_k, _type, ...)                       \
-do {                                                                   \
-       memset(&(_k), 0, sizeof(_k));                                   \
-       (_k).type       = BCH_DISK_ACCOUNTING_##_type;                  \
-       (_k)._type      = (struct bch_acct_##_type) { __VA_ARGS__ };    \
-} while (0)
-
-#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...)                \
-({                                                                     \
-       struct disk_accounting_pos pos;                                 \
-       disk_accounting_key_init(pos, __VA_ARGS__);                     \
-       bch2_disk_accounting_mod(_trans, &pos, _v, _nr, _gc);          \
-})
-
-#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...)                        \
-       bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
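A usage sketch for the helpers above, mirroring bch2_dev_usage_init() earlier in this diff and assuming a struct btree_trans *trans in scope; the device index and counter values are made up:

	u64 v[3] = { 16, 0, 0 };	/* buckets, sectors, fragmented (illustrative) */
	int ret = bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
					    .dev = 0,
					    .data_type = BCH_DATA_free);

Passing false for _gc takes the normal (non-GC) path in bch2_disk_accounting_mod(), which queues the update in the transaction's accounting subbuffer rather than applying it to the GC counters directly.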
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
-                            struct bkey_validate_context);
-void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
-void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_accounting_swab(struct bkey_s);
-
-#define bch2_bkey_ops_accounting ((struct bkey_ops) {  \
-       .key_validate   = bch2_accounting_validate,     \
-       .val_to_text    = bch2_accounting_to_text,      \
-       .swab           = bch2_accounting_swab,         \
-       .min_val_size   = 8,                            \
-})
-
-int bch2_accounting_update_sb(struct btree_trans *);
-
-static inline int accounting_pos_cmp(const void *_l, const void *_r)
-{
-       const struct bpos *l = _l, *r = _r;
-
-       return bpos_cmp(*l, *r);
-}
-
-enum bch_accounting_mode {
-       BCH_ACCOUNTING_normal,
-       BCH_ACCOUNTING_gc,
-       BCH_ACCOUNTING_read,
-};
-
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-void bch2_accounting_mem_gc(struct bch_fs *);
-
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
-{
-       return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
-               acc->type != BCH_DISK_ACCOUNTING_inum;
-}
-
-/*
- * Update in-memory counters so they match the btree update we're doing; called
- * from the transaction commit path
- */
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
-                                                struct bkey_s_c_accounting a,
-                                                enum bch_accounting_mode mode,
-                                                bool write_locked)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_accounting_mem *acc = &c->accounting;
-       struct disk_accounting_pos acc_k;
-       bpos_to_disk_accounting_pos(&acc_k, a.k->p);
-       bool gc = mode == BCH_ACCOUNTING_gc;
-
-       if (gc && !acc->gc_running)
-               return 0;
-
-       if (!bch2_accounting_is_mem(&acc_k))
-               return 0;
-
-       if (mode == BCH_ACCOUNTING_normal) {
-               switch (acc_k.type) {
-               case BCH_DISK_ACCOUNTING_persistent_reserved:
-                       trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
-                       break;
-               case BCH_DISK_ACCOUNTING_replicas:
-                       fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
-                       break;
-               case BCH_DISK_ACCOUNTING_dev_data_type: {
-                       guard(rcu)();
-                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
-                       if (ca) {
-                               this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
-                               this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
-                               this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
-                       }
-                       break;
-               }
-               }
-       }
-
-       unsigned idx;
-
-       while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                                     accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
-               int ret = 0;
-               if (unlikely(write_locked))
-                       ret = bch2_accounting_mem_insert_locked(c, a, mode);
-               else
-                       ret = bch2_accounting_mem_insert(c, a, mode);
-               if (ret)
-                       return ret;
-       }
-
-       struct accounting_mem_entry *e = &acc->k.data[idx];
-
-       EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
-
-       for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
-               this_cpu_add(e->v[gc][i], a.v->d[i]);
-       return 0;
-}
-
-static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
-{
-       percpu_down_read(&trans->c->mark_lock);
-       int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
-       percpu_up_read(&trans->c->mark_lock);
-       return ret;
-}
-
-static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
-                                                    unsigned idx, u64 *v, unsigned nr, bool gc)
-{
-       memset(v, 0, sizeof(*v) * nr);
-
-       if (unlikely(idx >= acc->k.nr))
-               return;
-
-       struct accounting_mem_entry *e = &acc->k.data[idx];
-
-       nr = min_t(unsigned, nr, e->nr_counters);
-
-       for (unsigned i = 0; i < nr; i++)
-               v[i] = percpu_u64_get(e->v[gc] + i);
-}
-
-static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
-                                           u64 *v, unsigned nr)
-{
-       percpu_down_read(&c->mark_lock);
-       struct bch_accounting_mem *acc = &c->accounting;
-       unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                                      accounting_pos_cmp, &p);
-
-       bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
-       percpu_up_read(&c->mark_lock);
-}
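
/*
 * Illustrative read (a sketch, not from the original header): fetch the
 * filesystem-wide nr_inodes counter from the in-memory copy.
 *
 *	struct disk_accounting_pos acc;
 *	disk_accounting_key_init(acc, nr_inodes);
 *
 *	u64 nr_inodes;
 *	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
 *				 &nr_inodes, 1);
 */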
-
-static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
-       EBUG_ON(!res->ref);
-
-       return (struct bversion) {
-               .hi = res->seq >> 32,
-               .lo = (res->seq << 32) | (res->offset + offset),
-       };
-}
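
/*
 * Worked case (illustrative): with res->seq = 0x100000007 and
 * res->offset = 5, journal_pos_to_bversion(res, 2) returns
 * { .hi = 0x1, .lo = 0x0000000700000007 } - the low 32 bits of seq
 * shifted into the high word of .lo, with offset 5 + 2 = 7 in the low
 * word. Comparing bversions therefore orders updates by journal
 * position, which is what accounting replay relies on.
 */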
-
-static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
-                                                   struct bkey_i_accounting *a,
-                                                   unsigned commit_flags)
-{
-       u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
-       a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
-
-       EBUG_ON(bversion_zero(a->k.bversion));
-
-       return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
-               ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
-               : 0;
-}
-
-static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
-                                                      struct bkey_i_accounting *a_i,
-                                                      unsigned commit_flags)
-{
-       if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
-               struct bkey_s_accounting a = accounting_i_to_s(a_i);
-
-               bch2_accounting_neg(a);
-               bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
-               bch2_accounting_neg(a);
-       }
-}
-
-int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
-int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-
-int bch2_gc_accounting_start(struct bch_fs *);
-int bch2_gc_accounting_done(struct bch_fs *);
-
-int bch2_accounting_read(struct bch_fs *);
-
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
-int bch2_dev_usage_init(struct bch_dev *, bool);
-
-void bch2_verify_accounting_clean(struct bch_fs *c);
-
-void bch2_accounting_gc_free(struct bch_fs *);
-void bch2_fs_accounting_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
deleted file mode 100644 (file)
index 8269af1..0000000
+++ /dev/null
@@ -1,225 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-
-#include "replicas_format.h"
-
-/*
- * Disk accounting - KEY_TYPE_accounting - on disk format:
- *
- * Here, the key has considerably more structure than a typical key (bpos); an
- * accounting key is 'struct disk_accounting_pos', which is a union overlaid
- * on struct bpos.
- *
- * More specifically: a key is just a multiword integer (where word endianness
- * matches native byte order), so we're treating bpos as an opaque 20 byte
- * integer and mapping bch_accounting_key to that.
- *
- * This is a type-tagged union of all our various subtypes; a disk accounting
- * key can be device counters, replicas counters, et cetera - it's extensible.
- *
- * The value is a list of u64s or s64s; the number of counters is specific to a
- * given accounting type.
- *
- * Unlike with other key types, updates are _deltas_, and the deltas are not
- * resolved until the update to the underlying btree, done by btree write buffer
- * flush or journal replay.
- *
- * Journal replay in particular requires special handling. The journal tracks a
- * range of entries which may not yet have been applied to the btree - it does
- * not know definitively whether individual entries are dirty and still need to
- * be applied.
- *
- * To handle this, we use the version field of struct bkey, and give every
- * accounting update a unique version number - a total ordering in time; the
- * version number is derived from the key's position in the journal. Then
- * journal replay can compare the version number of the key from the journal
- * with the version number of the key in the btree to determine if a key needs
- * to be replayed.
- *
- * For this to work, we must maintain this strict time ordering of updates as
- * they are flushed to the btree, both via write buffer flush and via journal
- * replay. This has complications for the write buffer code while journal replay
- * is still in progress; the write buffer cannot flush any accounting keys to
- * the btree until journal replay has finished replaying its accounting keys, or
- * the (newer) version number of the keys from the write buffer will cause
- * updates from journal replay to be lost.
- */
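
/*
 * A sketch of the replay decision described above (illustrative, not
 * code from this file): a key from the journal only needs to be
 * re-applied if its version is newer than what already reached the
 * btree.
 *
 *	if (bversion_cmp(journal_k->k.bversion, btree_k->k.bversion) > 0)
 *		accumulate journal_k into btree_k;
 *
 * where accumulation sums the counter deltas and keeps the newer
 * version, as bch2_accounting_accumulate() does in disk_accounting.h.
 */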
-
-struct bch_accounting {
-       struct bch_val          v;
-       __u64                   d[];
-};
-
-#define BCH_ACCOUNTING_MAX_COUNTERS            3
-
-#define BCH_DATA_TYPES()               \
-       x(free,         0)              \
-       x(sb,           1)              \
-       x(journal,      2)              \
-       x(btree,        3)              \
-       x(user,         4)              \
-       x(cached,       5)              \
-       x(parity,       6)              \
-       x(stripe,       7)              \
-       x(need_gc_gens, 8)              \
-       x(need_discard, 9)              \
-       x(unstriped,    10)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
-       BCH_DATA_TYPES()
-#undef x
-       BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
-       switch (type) {
-       case BCH_DATA_free:
-       case BCH_DATA_need_gc_gens:
-       case BCH_DATA_need_discard:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
-       switch (type) {
-       case BCH_DATA_sb:
-       case BCH_DATA_journal:
-               return true;
-       default:
-               return false;
-       }
-}
-
-/*
- * field 1: name
- * field 2: id
- * field 3: number of counters (max 3)
- */
-
-#define BCH_DISK_ACCOUNTING_TYPES()            \
-       x(nr_inodes,            0,      1)      \
-       x(persistent_reserved,  1,      1)      \
-       x(replicas,             2,      1)      \
-       x(dev_data_type,        3,      3)      \
-       x(compression,          4,      3)      \
-       x(snapshot,             5,      1)      \
-       x(btree,                6,      1)      \
-       x(rebalance_work,       7,      1)      \
-       x(inum,                 8,      3)
-
-enum disk_accounting_type {
-#define x(f, nr, ...)  BCH_DISK_ACCOUNTING_##f = nr,
-       BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-       BCH_DISK_ACCOUNTING_TYPE_NR,
-};
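
/*
 * Illustrative expansion of the x-macro above:
 *
 *	BCH_DISK_ACCOUNTING_nr_inodes		= 0,
 *	BCH_DISK_ACCOUNTING_persistent_reserved	= 1,
 *	...
 *	BCH_DISK_ACCOUNTING_inum		= 8,
 *
 * leaving BCH_DISK_ACCOUNTING_TYPE_NR == 9. The third field (number of
 * counters, at most BCH_ACCOUNTING_MAX_COUNTERS) is not consumed by
 * this particular expansion; it records how many u64s each type carries
 * in bch_accounting.d[].
 */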
-
-/*
- * No subtypes - number of inodes in the entire filesystem
- *
- * XXX: perhaps we could add a per-subvolume counter?
- */
-struct bch_acct_nr_inodes {
-};
-
-/*
- * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
- * reservation:
- */
-struct bch_acct_persistent_reserved {
-       __u8                    nr_replicas;
-};
-
-/*
- * device, data type counter fields:
- * [
- *   nr_buckets
- *   live sectors (in buckets of that data type)
- *   sectors of internal fragmentation
- * ]
- *
- * XXX: live sectors should've been done differently: a bucket can hold
- * multiple data types (user, stripe, cached), and this collapses them to the
- * bucket data type, which also makes the internal fragmentation counter
- * redundant
- */
-struct bch_acct_dev_data_type {
-       __u8                    dev;
-       __u8                    data_type;
-};
-
-/*
- * Compression type fields:
- * [
- *   number of extents
- *   uncompressed size
- *   compressed size
- * ]
- *
- * Compression ratio, average extent size (fragmentation).
- */
-struct bch_acct_compression {
-       __u8                    type;
-};
-
-/*
- * On disk usage by snapshot id; counts the same values as the replicas
- * counter, but aggregated differently
- */
-struct bch_acct_snapshot {
-       __u32                   id;
-} __packed;
-
-struct bch_acct_btree {
-       __u32                   id;
-} __packed;
-
-/*
- * inum counter fields:
- * [
- *   number of extents
- *   sum of extent sizes - bkey size
- *     this field is similar to inode.bi_sectors, except here extents in
- *     different snapshots but the same inode number are all collapsed to the
- *     same counter
- *   sum of on disk size - same values tracked by replicas counters
- * ]
- *
- * This tracks on disk fragmentation.
- */
-struct bch_acct_inum {
-       __u64                   inum;
-} __packed;
-
-/*
- * Simple counter of the amount of data (on disk sectors) rebalance needs to
- * move; extents counted here are also in the rebalance_work btree.
- */
-struct bch_acct_rebalance_work {
-};
-
-struct disk_accounting_pos {
-       union {
-       struct {
-               __u8                            type;
-               union {
-               struct bch_acct_nr_inodes       nr_inodes;
-               struct bch_acct_persistent_reserved     persistent_reserved;
-               struct bch_replicas_entry_v1    replicas;
-               struct bch_acct_dev_data_type   dev_data_type;
-               struct bch_acct_compression     compression;
-               struct bch_acct_snapshot        snapshot;
-               struct bch_acct_btree           btree;
-               struct bch_acct_rebalance_work  rebalance_work;
-               struct bch_acct_inum            inum;
-               } __packed;
-       } __packed;
-               struct bpos                     _pad;
-       };
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
deleted file mode 100644 (file)
index b198213..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-
-#include "darray.h"
-
-struct accounting_mem_entry {
-       struct bpos                             pos;
-       struct bversion                         bversion;
-       unsigned                                nr_counters;
-       u64 __percpu                            *v[2];
-};
-
-struct bch_accounting_mem {
-       DARRAY(struct accounting_mem_entry)     k;
-       bool                                    gc_running;
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
deleted file mode 100644 (file)
index cde842a..0000000
+++ /dev/null
@@ -1,591 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "disk_groups.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int group_cmp(const void *_l, const void *_r)
-{
-       const struct bch_disk_group *l = _l;
-       const struct bch_disk_group *r = _r;
-
-       return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
-               (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
-               ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
-                (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
-               strncmp(l->label, r->label, sizeof(l->label));
-}
-
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_disk_groups *groups =
-               field_to_type(f, disk_groups);
-       struct bch_disk_group *g, *sorted = NULL;
-       unsigned nr_groups = disk_groups_nr(groups);
-       unsigned i, len;
-       int ret = 0;
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member m = bch2_sb_member_get(sb, i);
-               unsigned group_id;
-
-               if (!BCH_MEMBER_GROUP(&m))
-                       continue;
-
-               group_id = BCH_MEMBER_GROUP(&m) - 1;
-
-               if (group_id >= nr_groups) {
-                       prt_printf(err, "disk %u has invalid label %u (have %u)",
-                                  i, group_id, nr_groups);
-                       return -BCH_ERR_invalid_sb_disk_groups;
-               }
-
-               if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
-                       prt_printf(err, "disk %u has deleted label %u", i, group_id);
-                       return -BCH_ERR_invalid_sb_disk_groups;
-               }
-       }
-
-       if (!nr_groups)
-               return 0;
-
-       for (i = 0; i < nr_groups; i++) {
-               g = groups->entries + i;
-
-               if (BCH_GROUP_DELETED(g))
-                       continue;
-
-               len = strnlen(g->label, sizeof(g->label));
-               if (!len) {
-                       prt_printf(err, "label %u empty", i);
-                       return -BCH_ERR_invalid_sb_disk_groups;
-               }
-       }
-
-       sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
-       if (!sorted)
-               return -BCH_ERR_ENOMEM_disk_groups_validate;
-
-       memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
-       sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
-
-       for (g = sorted; g + 1 < sorted + nr_groups; g++)
-               if (!BCH_GROUP_DELETED(g) &&
-                   !group_cmp(&g[0], &g[1])) {
-                       prt_printf(err, "duplicate label %llu.%.*s",
-                              BCH_GROUP_PARENT(g),
-                              (int) sizeof(g->label), g->label);
-                       ret = -BCH_ERR_invalid_sb_disk_groups;
-                       goto err;
-               }
-err:
-       kfree(sorted);
-       return ret;
-}
-
-static void bch2_sb_disk_groups_to_text(struct printbuf *out,
-                                       struct bch_sb *sb,
-                                       struct bch_sb_field *f)
-{
-       struct bch_sb_field_disk_groups *groups =
-               field_to_type(f, disk_groups);
-       struct bch_disk_group *g;
-       unsigned nr_groups = disk_groups_nr(groups);
-
-       for (g = groups->entries;
-            g < groups->entries + nr_groups;
-            g++) {
-               if (g != groups->entries)
-                       prt_printf(out, " ");
-
-               if (BCH_GROUP_DELETED(g))
-                       prt_printf(out, "[deleted]");
-               else
-                       prt_printf(out, "[parent %llu name %s]",
-                              BCH_GROUP_PARENT(g), g->label);
-       }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
-       .validate       = bch2_sb_disk_groups_validate,
-       .to_text        = bch2_sb_disk_groups_to_text
-};
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_disk_groups *groups;
-       struct bch_disk_groups_cpu *cpu_g, *old_g;
-       unsigned i, g, nr_groups;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       groups          = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
-       nr_groups       = disk_groups_nr(groups);
-
-       if (!groups)
-               return 0;
-
-       cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
-       if (!cpu_g)
-               return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
-
-       cpu_g->nr = nr_groups;
-
-       for (i = 0; i < nr_groups; i++) {
-               struct bch_disk_group *src      = &groups->entries[i];
-               struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];
-
-               dst->deleted    = BCH_GROUP_DELETED(src);
-               dst->parent     = BCH_GROUP_PARENT(src);
-               memcpy(dst->label, src->label, sizeof(dst->label));
-       }
-
-       for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
-               struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
-               struct bch_disk_group_cpu *dst;
-
-               if (!bch2_member_alive(&m))
-                       continue;
-
-               g = BCH_MEMBER_GROUP(&m);
-               while (g) {
-                       dst = &cpu_g->entries[g - 1];
-                       __set_bit(i, dst->devs.d);
-                       g = dst->parent;
-               }
-       }
-
-       old_g = rcu_dereference_protected(c->disk_groups,
-                               lockdep_is_held(&c->sb_lock));
-       rcu_assign_pointer(c->disk_groups, cpu_g);
-       if (old_g)
-               kfree_rcu(old_g, rcu);
-
-       return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
-       struct target t = target_decode(target);
-
-       guard(rcu)();
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return NULL;
-       case TARGET_DEV: {
-               struct bch_dev *ca = t.dev < c->sb.nr_devices
-                       ? rcu_dereference(c->devs[t.dev])
-                       : NULL;
-               return ca ? &ca->self : NULL;
-       }
-       case TARGET_GROUP: {
-               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
-               return g && t.group < g->nr && !g->entries[t.group].deleted
-                       ? &g->entries[t.group].devs
-                       : NULL;
-       }
-       default:
-               BUG();
-       }
-}
-
-bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
-       struct target t = target_decode(target);
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return false;
-       case TARGET_DEV:
-               return dev == t.dev;
-       case TARGET_GROUP: {
-               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-               const struct bch_devs_mask *m =
-                       g && t.group < g->nr && !g->entries[t.group].deleted
-                       ? &g->entries[t.group].devs
-                       : NULL;
-
-               return m ? test_bit(dev, m->d) : false;
-       }
-       default:
-               BUG();
-       }
-}
-
-static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
-                                 unsigned parent,
-                                 const char *name, unsigned namelen)
-{
-       unsigned i, nr_groups = disk_groups_nr(groups);
-
-       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
-               return -EINVAL;
-
-       for (i = 0; i < nr_groups; i++) {
-               struct bch_disk_group *g = groups->entries + i;
-
-               if (BCH_GROUP_DELETED(g))
-                       continue;
-
-               if (BCH_GROUP_PARENT(g) == parent &&
-                   strnlen(g->label, sizeof(g->label)) == namelen &&
-                   !memcmp(name, g->label, namelen))
-                       return i;
-       }
-
-       return -1;
-}
-
-static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
-                                const char *name, unsigned namelen)
-{
-       struct bch_sb_field_disk_groups *groups =
-               bch2_sb_field_get(sb->sb, disk_groups);
-       unsigned i, nr_groups = disk_groups_nr(groups);
-       struct bch_disk_group *g;
-
-       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
-               return -EINVAL;
-
-       for (i = 0;
-            i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
-            i++)
-               ;
-
-       if (i == nr_groups) {
-               unsigned u64s =
-                       (sizeof(struct bch_sb_field_disk_groups) +
-                        sizeof(struct bch_disk_group) * (nr_groups + 1)) /
-                       sizeof(u64);
-
-               groups = bch2_sb_field_resize(sb, disk_groups, u64s);
-               if (!groups)
-                       return -BCH_ERR_ENOSPC_disk_label_add;
-
-               nr_groups = disk_groups_nr(groups);
-       }
-
-       BUG_ON(i >= nr_groups);
-
-       g = &groups->entries[i];
-
-       memcpy(g->label, name, namelen);
-       if (namelen < sizeof(g->label))
-               g->label[namelen] = '\0';
-       SET_BCH_GROUP_DELETED(g, 0);
-       SET_BCH_GROUP_PARENT(g, parent);
-       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-
-       return i;
-}
-
-int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
-{
-       struct bch_sb_field_disk_groups *groups =
-               bch2_sb_field_get(sb->sb, disk_groups);
-       int v = -1;
-
-       do {
-               const char *next = strchrnul(name, '.');
-               unsigned len = next - name;
-
-               if (*next == '.')
-                       next++;
-
-               v = __bch2_disk_group_find(groups, v + 1, name, len);
-               name = next;
-       } while (*name && v >= 0);
-
-       return v;
-}
-
-int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
-{
-       struct bch_sb_field_disk_groups *groups;
-       unsigned parent = 0;
-       int v = -1;
-
-       do {
-               const char *next = strchrnul(name, '.');
-               unsigned len = next - name;
-
-               if (*next == '.')
-                       next++;
-
-               groups = bch2_sb_field_get(sb->sb, disk_groups);
-
-               v = __bch2_disk_group_find(groups, parent, name, len);
-               if (v < 0)
-                       v = __bch2_disk_group_add(sb, parent, name, len);
-               if (v < 0)
-                       return v;
-
-               parent = v + 1;
-               name = next;
-       } while (*name && v >= 0);
-
-       return v;
-}
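
/*
 * Example (illustrative): bch2_disk_path_find_or_create(sb, "ssd.fast")
 * finds or creates a top level label "ssd", then a child "fast" whose
 * BCH_GROUP_PARENT is the index of "ssd" plus one (zero meaning no
 * parent), and returns the index of "fast".
 */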
-
-static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
-                                    unsigned v)
-{
-       u16 path[32];
-       unsigned nr = 0;
-
-       while (1) {
-               if (nr == ARRAY_SIZE(path))
-                       goto invalid;
-
-               if (v >= (g ? g->nr : 0))
-                       goto invalid;
-
-               struct bch_disk_group_cpu *e = g->entries + v;
-
-               if (e->deleted)
-                       goto invalid;
-
-               path[nr++] = v;
-
-               if (!e->parent)
-                       break;
-
-               v = e->parent - 1;
-       }
-
-       while (nr) {
-               struct bch_disk_group_cpu *e = g->entries + path[--nr];
-
-               prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
-               if (nr)
-                       prt_printf(out, ".");
-       }
-       return;
-invalid:
-       prt_printf(out, "invalid label %u", v);
-}
-
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       bch2_printbuf_make_room(out, 4096);
-
-       out->atomic++;
-       guard(rcu)();
-       struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
-       for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
-               prt_printf(out, "%2u: ", i);
-
-               if (g->entries[i].deleted) {
-                       prt_printf(out, "[deleted]");
-                       goto next;
-               }
-
-               __bch2_disk_path_to_text(out, g, i);
-
-               prt_printf(out, " devs");
-
-               for_each_member_device_rcu(c, ca, &g->entries[i].devs)
-                       prt_printf(out, " %s", ca->name);
-next:
-               prt_newline(out);
-       }
-
-       out->atomic--;
-}
-
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
-       out->atomic++;
-       guard(rcu)();
-       __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v);
-       --out->atomic;
-}
-
-void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
-       struct bch_sb_field_disk_groups *groups =
-               bch2_sb_field_get(sb, disk_groups);
-       struct bch_disk_group *g;
-       unsigned nr = 0;
-       u16 path[32];
-
-       while (1) {
-               if (nr == ARRAY_SIZE(path))
-                       goto inval;
-
-               if (v >= disk_groups_nr(groups))
-                       goto inval;
-
-               g = groups->entries + v;
-
-               if (BCH_GROUP_DELETED(g))
-                       goto inval;
-
-               path[nr++] = v;
-
-               if (!BCH_GROUP_PARENT(g))
-                       break;
-
-               v = BCH_GROUP_PARENT(g) - 1;
-       }
-
-       while (nr) {
-               v = path[--nr];
-               g = groups->entries + v;
-
-               prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
-               if (nr)
-                       prt_printf(out, ".");
-       }
-       return;
-inval:
-       prt_printf(out, "invalid label %u", v);
-}
-
-int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
-       lockdep_assert_held(&c->sb_lock);
-
-       if (!strlen(name) || !strcmp(name, "none")) {
-               struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-               SET_BCH_MEMBER_GROUP(mi, 0);
-       } else {
-               int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
-               if (v < 0)
-                       return v;
-
-               struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-               SET_BCH_MEMBER_GROUP(mi, v + 1);
-       }
-
-       return bch2_sb_disk_groups_to_cpu(c);
-}
-
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       ret = __bch2_dev_group_set(c, ca, name) ?:
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
-                         struct printbuf *err)
-{
-       struct bch_dev *ca;
-       int g;
-
-       if (!val)
-               return -EINVAL;
-
-       if (!c)
-               return -BCH_ERR_option_needs_open_fs;
-
-       if (!strlen(val) || !strcmp(val, "none")) {
-               *res = 0;
-               return 0;
-       }
-
-       /* Is it a device? */
-       ca = bch2_dev_lookup(c, val);
-       if (!IS_ERR(ca)) {
-               *res = dev_to_target(ca->dev_idx);
-               bch2_dev_put(ca);
-               return 0;
-       }
-
-       mutex_lock(&c->sb_lock);
-       g = bch2_disk_path_find(&c->disk_sb, val);
-       mutex_unlock(&c->sb_lock);
-
-       if (g >= 0) {
-               *res = group_to_target(g);
-               return 0;
-       }
-
-       return -EINVAL;
-}
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
-       struct target t = target_decode(v);
-
-       switch (t.type) {
-       case TARGET_NULL:
-               prt_printf(out, "none");
-               return;
-       case TARGET_DEV: {
-               out->atomic++;
-               guard(rcu)();
-               struct bch_dev *ca = t.dev < c->sb.nr_devices
-                       ? rcu_dereference(c->devs[t.dev])
-                       : NULL;
-
-               if (ca && ca->disk_sb.bdev)
-                       prt_printf(out, "/dev/%s", ca->name);
-               else if (ca)
-                       prt_printf(out, "offline device %u", t.dev);
-               else
-                       prt_printf(out, "invalid device %u", t.dev);
-
-               out->atomic--;
-               return;
-       }
-       case TARGET_GROUP:
-               bch2_disk_path_to_text(out, c, t.group);
-               return;
-       default:
-               BUG();
-       }
-}
-
-static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
-       struct target t = target_decode(v);
-
-       switch (t.type) {
-       case TARGET_NULL:
-               prt_printf(out, "none");
-               break;
-       case TARGET_DEV: {
-               struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
-               if (bch2_member_exists(sb, t.dev)) {
-                       prt_printf(out, "Device ");
-                       pr_uuid(out, m.uuid.b);
-                       prt_printf(out, " (%u)", t.dev);
-               } else {
-                       prt_printf(out, "Bad device %u", t.dev);
-               }
-               break;
-       }
-       case TARGET_GROUP:
-               bch2_disk_path_to_text_sb(out, sb, t.group);
-               break;
-       default:
-               BUG();
-       }
-}
-
-void bch2_opt_target_to_text(struct printbuf *out,
-                            struct bch_fs *c,
-                            struct bch_sb *sb,
-                            u64 v)
-{
-       if (c)
-               bch2_target_to_text(out, c, v);
-       else
-               bch2_target_to_text_sb(out, sb, v);
-}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
deleted file mode 100644 (file)
index 441826f..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_H
-#define _BCACHEFS_DISK_GROUPS_H
-
-#include "disk_groups_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
-       return groups
-               ? (vstruct_end(&groups->field) -
-                  (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
-               : 0;
-}
-
-struct target {
-       enum {
-               TARGET_NULL,
-               TARGET_DEV,
-               TARGET_GROUP,
-       }                       type;
-       union {
-               unsigned        dev;
-               unsigned        group;
-       };
-};
-
-#define TARGET_DEV_START       1
-#define TARGET_GROUP_START     (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
-       return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
-       return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
-       if (target >= TARGET_GROUP_START)
-               return (struct target) {
-                       .type   = TARGET_GROUP,
-                       .group  = target - TARGET_GROUP_START
-               };
-
-       if (target >= TARGET_DEV_START)
-               return (struct target) {
-                       .type   = TARGET_DEV,
-                       .dev    = target - TARGET_DEV_START
-               };
-
-       return (struct target) { .type = TARGET_NULL };
-}
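
/*
 * Illustrative encoding (follows the constants above): target 0 is
 * TARGET_NULL, targets 1..256 are devices 0..255, and targets from 257
 * up are label indices starting at 0:
 *
 *	dev_to_target(0)	== 1
 *	group_to_target(0)	== 257
 *	target_decode(257)	== (struct target) { TARGET_GROUP, .group = 0 }
 */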
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
-                                                 enum bch_data_type data_type,
-                                                 u16 target)
-{
-       struct bch_devs_mask devs = c->rw_devs[data_type];
-       const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
-
-       if (t)
-               bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
-       return devs;
-}
-
-static inline bool bch2_target_accepts_data(struct bch_fs *c,
-                                           enum bch_data_type data_type,
-                                           u16 target)
-{
-       struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
-       return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
-}
-
-bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
-
-int bch2_disk_path_find(struct bch_sb_handle *, const char *);
-
-/* Exported for userspace bcachefs-tools: */
-int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-
-void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
-void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-
-#define bch2_opt_target (struct bch_opt_fn) {          \
-       .parse          = bch2_opt_target_parse,        \
-       .to_text        = bch2_opt_target_to_text,      \
-}
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-
-const char *bch2_sb_validate_disk_groups(struct bch_sb *,
-                                        struct bch_sb_field *);
-
-void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
deleted file mode 100644 (file)
index 698990b..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
-#define _BCACHEFS_DISK_GROUPS_FORMAT_H
-
-#define BCH_SB_LABEL_SIZE              32
-
-struct bch_disk_group {
-       __u8                    label[BCH_SB_LABEL_SIZE];
-       __le64                  flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0,  1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1,  6)
-LE64_BITMASK(BCH_GROUP_PARENT,         struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
-       struct bch_sb_field     field;
-       struct bch_disk_group   entries[];
-} __packed __aligned(8);
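
/*
 * Layout of flags[0] implied by the bitmasks above (each LE64_BITMASK
 * covers bits [start, end)):
 *
 *	bit  0		DELETED
 *	bits 1..5	DATA_ALLOWED
 *	bits 6..23	PARENT (parent group index + 1; 0 == no parent)
 */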
-
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
deleted file mode 100644 (file)
index a54ef08..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
-#define _BCACHEFS_DISK_GROUPS_TYPES_H
-
-struct bch_disk_group_cpu {
-       bool                            deleted;
-       u16                             parent;
-       u8                              label[BCH_SB_LABEL_SIZE];
-       struct bch_devs_mask            devs;
-};
-
-struct bch_disk_groups_cpu {
-       struct rcu_head                 rcu;
-       unsigned                        nr;
-       struct bch_disk_group_cpu       entries[] __counted_by(nr);
-};
-
-#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
deleted file mode 100644 (file)
index 543dbba..0000000
+++ /dev/null
@@ -1,2405 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* erasure coding */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "lru.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "util.h"
-
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-#ifdef __KERNEL__
-
-#include <linux/raid/pq.h>
-#include <linux/raid/xor.h>
-
-/*
- * Reconstruct one missing block as the XOR of all the others: swap the
- * failed block into slot 0, seed it with block 1, then XOR in the
- * remaining blocks in MAX_XOR_BLOCKS-sized batches; finally swap back.
- */
-static void raid5_recov(unsigned disks, unsigned failed_idx,
-                       size_t size, void **data)
-{
-       unsigned i = 2, nr;
-
-       BUG_ON(failed_idx >= disks);
-
-       swap(data[0], data[failed_idx]);
-       memcpy(data[0], data[1], size);
-
-       while (i < disks) {
-               nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
-               xor_blocks(nr, size, data[0], data + i);
-               i += nr;
-       }
-
-       swap(data[0], data[failed_idx]);
-}
-
-static void raid_gen(int nd, int np, size_t size, void **v)
-{
-       if (np >= 1)
-               raid5_recov(nd + np, nd, size, v);
-       if (np >= 2)
-               raid6_call.gen_syndrome(nd + np, size, v);
-       BUG_ON(np > 2);
-}
-
-static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
-{
-       switch (nr) {
-       case 0:
-               break;
-       case 1:
-               if (ir[0] < nd + 1)
-                       raid5_recov(nd + 1, ir[0], size, v);
-               else
-                       raid6_call.gen_syndrome(nd + np, size, v);
-               break;
-       case 2:
-               if (ir[1] < nd) {
-                       /* data+data failure. */
-                       raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
-               } else if (ir[0] < nd) {
-                       /* data + p/q failure */
-
-                       if (ir[1] == nd) /* data + p failure */
-                               raid6_datap_recov(nd + np, size, ir[0], v);
-                       else { /* data + q failure */
-                               raid5_recov(nd + 1, ir[0], size, v);
-                               raid6_call.gen_syndrome(nd + np, size, v);
-                       }
-               } else {
-                       raid_gen(nd, np, size, v);
-               }
-               break;
-       default:
-               BUG();
-       }
-}
-
-#else
-
-#include <raid/raid.h>
-
-#endif
-
-struct ec_bio {
-       struct bch_dev          *ca;
-       struct ec_stripe_buf    *buf;
-       size_t                  idx;
-       int                     rw;
-       u64                     submit_time;
-       struct bio              bio;
-};
-
-/* Stripes btree keys: */
-
-int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
-                        struct bkey_validate_context from)
-{
-       const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
-                        bpos_gt(k.k->p, POS(0, U32_MAX)),
-                        c, stripe_pos_bad,
-                        "stripe at bad pos");
-
-       bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
-                        c, stripe_val_size_bad,
-                        "incorrect value size (%zu < %u)",
-                        bkey_val_u64s(k.k), stripe_val_u64s(s));
-
-       bkey_fsck_err_on(s->csum_granularity_bits >= 64,
-                        c, stripe_csum_granularity_bad,
-                        "invalid csum granularity (%u >= 64)",
-                        s->csum_granularity_bits);
-
-       ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
-                        struct bkey_s_c k)
-{
-       const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
-       struct bch_stripe s = {};
-
-       memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
-
-       unsigned nr_data = s.nr_blocks - s.nr_redundant;
-
-       prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
-                  s.algorithm,
-                  le16_to_cpu(s.sectors),
-                  nr_data,
-                  s.nr_redundant);
-       bch2_prt_csum_type(out, s.csum_type);
-       prt_str(out, " gran ");
-       if (s.csum_granularity_bits < 64)
-               prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
-       else
-               prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
-
-       if (s.disk_label) {
-               prt_str(out, " label");
-               bch2_disk_path_to_text(out, c, s.disk_label - 1);
-       }
-
-       for (unsigned i = 0; i < s.nr_blocks; i++) {
-               const struct bch_extent_ptr *ptr = sp->ptrs + i;
-
-               if ((void *) ptr >= bkey_val_end(k))
-                       break;
-
-               prt_char(out, ' ');
-               bch2_extent_ptr_to_text(out, c, ptr);
-
-               if (s.csum_type < BCH_CSUM_NR &&
-                   i < nr_data &&
-                   stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
-                       prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
-       }
-}
-
-/* Triggers: */
-
-static int __mark_stripe_bucket(struct btree_trans *trans,
-                               struct bch_dev *ca,
-                               struct bkey_s_c_stripe s,
-                               unsigned ptr_idx, bool deleting,
-                               struct bpos bucket,
-                               struct bch_alloc_v4 *a,
-                               enum btree_iter_update_trigger_flags flags)
-{
-       const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
-       unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
-       bool parity = ptr_idx >= nr_data;
-       enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
-       s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bch_fs *c = trans->c;
-       if (deleting)
-               sectors = -sectors;
-
-       if (!deleting) {
-               if (bch2_trans_inconsistent_on(a->stripe ||
-                                              a->stripe_redundancy, trans,
-                               "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
-                               bucket.inode, bucket.offset, a->gen,
-                               bch2_data_type_str(a->data_type),
-                               a->dirty_sectors,
-                               a->stripe, s.k->p.offset,
-                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-
-               if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
-                               "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
-                               bucket.inode, bucket.offset, a->gen,
-                               bch2_data_type_str(a->data_type),
-                               a->dirty_sectors,
-                               a->cached_sectors,
-                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-       } else {
-               if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
-                                              a->stripe_redundancy != s.v->nr_redundant, trans,
-                               "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
-                               bucket.inode, bucket.offset, a->gen,
-                               a->stripe,
-                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-
-               if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
-                               "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
-                               bucket.inode, bucket.offset, a->gen,
-                               bch2_data_type_str(a->data_type),
-                               bch2_data_type_str(data_type),
-                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-
-               if (bch2_trans_inconsistent_on(parity &&
-                                              (a->dirty_sectors != -sectors ||
-                                               a->cached_sectors), trans,
-                               "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
-                               bucket.inode, bucket.offset, a->gen,
-                               a->dirty_sectors,
-                               a->cached_sectors,
-                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-       }
-
-       if (sectors) {
-               ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
-                                            a->gen, a->data_type, &a->dirty_sectors);
-               if (ret)
-                       goto err;
-       }
-
-       if (!deleting) {
-               a->stripe               = s.k->p.offset;
-               a->stripe_redundancy    = s.v->nr_redundant;
-               alloc_data_type_set(a, data_type);
-       } else {
-               a->stripe               = 0;
-               a->stripe_redundancy    = 0;
-               alloc_data_type_set(a, BCH_DATA_user);
-       }
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int mark_stripe_bucket(struct btree_trans *trans,
-                             struct bkey_s_c_stripe s,
-                             unsigned ptr_idx, bool deleting,
-                             enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
-       if (unlikely(!ca)) {
-               if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
-                       ret = bch_err_throw(c, mark_stripe);
-               goto err;
-       }
-
-       struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               struct extent_ptr_decoded p = {
-                       .ptr = *ptr,
-                       .crc = bch2_extent_crc_unpack(s.k, NULL),
-               };
-               struct bkey_i_backpointer bp;
-               bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
-                                     (const union bch_extent_entry *) ptr, &bp);
-
-               struct bkey_i_alloc_v4 *a =
-                       bch2_trans_start_alloc_update(trans, bucket, 0);
-               ret   = PTR_ERR_OR_ZERO(a) ?:
-                       __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
-                       bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
-                                                   !(flags & BTREE_TRIGGER_overwrite));
-               if (ret)
-                       goto err;
-       }
-
-       if (flags & BTREE_TRIGGER_gc) {
-               struct bucket *g = gc_bucket(ca, bucket.offset);
-               if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
-                                           ptr->dev,
-                                           (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-                       ret = bch_err_throw(c, mark_stripe);
-                       goto err;
-               }
-
-               bucket_lock(g);
-               struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
-               ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
-               alloc_to_bucket(g, new);
-               bucket_unlock(g);
-
-               if (!ret)
-                       ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
-       }
-err:
-       bch2_dev_put(ca);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int mark_stripe_buckets(struct btree_trans *trans,
-                              struct bkey_s_c old, struct bkey_s_c new,
-                              enum btree_iter_update_trigger_flags flags)
-{
-       const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(old).v : NULL;
-       const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(new).v : NULL;
-
-       BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
-
-       unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
-       for (unsigned i = 0; i < nr_blocks; i++) {
-               if (new_s && old_s &&
-                   !memcmp(&new_s->ptrs[i],
-                           &old_s->ptrs[i],
-                           sizeof(new_s->ptrs[i])))
-                       continue;
-
-               if (new_s) {
-                       int ret = mark_stripe_bucket(trans,
-                                       bkey_s_c_to_stripe(new), i, false, flags);
-                       if (ret)
-                               return ret;
-               }
-
-               if (old_s) {
-                       int ret = mark_stripe_bucket(trans,
-                                       bkey_s_c_to_stripe(old), i, true, flags);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_trigger_stripe(struct btree_trans *trans,
-                       enum btree_id btree, unsigned level,
-                       struct bkey_s_c old, struct bkey_s _new,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_s_c new = _new.s_c;
-       struct bch_fs *c = trans->c;
-       u64 idx = new.k->p.offset;
-       const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(old).v : NULL;
-       const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(new).v : NULL;
-
-       if (unlikely(flags & BTREE_TRIGGER_check_repair))
-               return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
-
-       BUG_ON(new_s && old_s &&
-              (new_s->nr_blocks        != old_s->nr_blocks ||
-               new_s->nr_redundant     != old_s->nr_redundant));
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               int ret = bch2_lru_change(trans,
-                                         BCH_LRU_STRIPE_FRAGMENTATION,
-                                         idx,
-                                         stripe_lru_pos(old_s),
-                                         stripe_lru_pos(new_s));
-               if (ret)
-                       return ret;
-       }
-
-       if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
-               /*
-                * If the pointers aren't changing, we don't need to do anything:
-                */
-               if (new_s && old_s &&
-                   new_s->nr_blocks    == old_s->nr_blocks &&
-                   new_s->nr_redundant == old_s->nr_redundant &&
-                   !memcmp(old_s->ptrs, new_s->ptrs,
-                           new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
-                       return 0;
-
-               struct gc_stripe *gc = NULL;
-               if (flags & BTREE_TRIGGER_gc) {
-                       gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-                       if (!gc) {
-                               bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
-                               return bch_err_throw(c, ENOMEM_mark_stripe);
-                       }
-
-                       /*
-                        * This will be wrong when we bring back runtime gc: we should
-                        * be unmarking the old key and then marking the new key
-                        *
-                        * Also: when we bring back runtime gc, locking
-                        */
-                       gc->alive       = true;
-                       gc->sectors     = le16_to_cpu(new_s->sectors);
-                       gc->nr_blocks   = new_s->nr_blocks;
-                       gc->nr_redundant        = new_s->nr_redundant;
-
-                       for (unsigned i = 0; i < new_s->nr_blocks; i++)
-                               gc->ptrs[i] = new_s->ptrs[i];
-
-                       /*
-                        * gc recalculates this field from stripe ptr
-                        * references:
-                        */
-                       memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
-               }
-
-               if (new_s) {
-                       s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
-
-                       struct disk_accounting_pos acc;
-                       memset(&acc, 0, sizeof(acc));
-                       acc.type = BCH_DISK_ACCOUNTING_replicas;
-                       bch2_bkey_to_replicas(&acc.replicas, new);
-                       int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-                       if (ret)
-                               return ret;
-
-                       if (gc)
-                               unsafe_memcpy(&gc->r.e, &acc.replicas,
-                                             replicas_entry_bytes(&acc.replicas), "VLA");
-               }
-
-               if (old_s) {
-                       s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
-
-                       struct disk_accounting_pos acc;
-                       memset(&acc, 0, sizeof(acc));
-                       acc.type = BCH_DISK_ACCOUNTING_replicas;
-                       bch2_bkey_to_replicas(&acc.replicas, old);
-                       int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-                       if (ret)
-                               return ret;
-               }
-
-               int ret = mark_stripe_buckets(trans, old, new, flags);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* returns the matched pointer, and sets *block to the blocknr we matched: */
-static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
-                                               struct bkey_s_c k, unsigned *block)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
-
-       bkey_for_each_ptr(ptrs, ptr)
-               for (i = 0; i < nr_data; i++)
-                       if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
-                                                     le16_to_cpu(s->sectors))) {
-                               *block = i;
-                               return ptr;
-                       }
-
-       return NULL;
-}
-
-static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-
-       bkey_extent_entry_for_each(ptrs, entry)
-               if (extent_entry_type(entry) ==
-                   BCH_EXTENT_ENTRY_stripe_ptr &&
-                   entry->stripe_ptr.idx == idx)
-                       return true;
-
-       return false;
-}
-
-/* Stripe bufs: */
-
-static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
-{
-       if (buf->key.k.type == KEY_TYPE_stripe) {
-               struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
-               unsigned i;
-
-               for (i = 0; i < s->v.nr_blocks; i++) {
-                       kvfree(buf->data[i]);
-                       buf->data[i] = NULL;
-               }
-       }
-}
-
-/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct bch_fs *c,
-                             struct ec_stripe_buf *buf,
-                             unsigned offset, unsigned size)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned csum_granularity = 1U << v->csum_granularity_bits;
-       unsigned end = offset + size;
-       unsigned i;
-
-       BUG_ON(end > le16_to_cpu(v->sectors));
-
-       offset  = round_down(offset, csum_granularity);
-       end     = min_t(unsigned, le16_to_cpu(v->sectors),
-                       round_up(end, csum_granularity));
-
-       buf->offset     = offset;
-       buf->size       = end - offset;
-
-       memset(buf->valid, 0xFF, sizeof(buf->valid));
-
-       for (i = 0; i < v->nr_blocks; i++) {
-               buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
-               if (!buf->data[i])
-                       goto err;
-       }
-
-       return 0;
-err:
-       ec_stripe_buf_exit(buf);
-       return bch_err_throw(c, ENOMEM_stripe_buf);
-}
-
-/* Checksumming: */
-
-static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
-                                        unsigned block, unsigned offset)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned csum_granularity = 1 << v->csum_granularity_bits;
-       unsigned end = buf->offset + buf->size;
-       unsigned len = min(csum_granularity, end - offset);
-
-       BUG_ON(offset >= end);
-       BUG_ON(offset <  buf->offset);
-       BUG_ON(offset & (csum_granularity - 1));
-       BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
-              (len & (csum_granularity - 1)));
-
-       return bch2_checksum(NULL, v->csum_type,
-                            null_nonce(),
-                            buf->data[block] + ((offset - buf->offset) << 9),
-                            len << 9);
-}
-
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned i, j, csums_per_device = stripe_csums_per_device(v);
-
-       if (!v->csum_type)
-               return;
-
-       BUG_ON(buf->offset);
-       BUG_ON(buf->size != le16_to_cpu(v->sectors));
-
-       for (i = 0; i < v->nr_blocks; i++)
-               for (j = 0; j < csums_per_device; j++)
-                       stripe_csum_set(v, i, j,
-                               ec_block_checksum(buf, i, j << v->csum_granularity_bits));
-}
-
-static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned csum_granularity = 1 << v->csum_granularity_bits;
-       unsigned i;
-
-       if (!v->csum_type)
-               return;
-
-       for (i = 0; i < v->nr_blocks; i++) {
-               unsigned offset = buf->offset;
-               unsigned end = buf->offset + buf->size;
-
-               if (!test_bit(i, buf->valid))
-                       continue;
-
-               while (offset < end) {
-                       unsigned j = offset >> v->csum_granularity_bits;
-                       unsigned len = min(csum_granularity, end - offset);
-                       struct bch_csum want = stripe_csum_get(v, i, j);
-                       struct bch_csum got = ec_block_checksum(buf, i, offset);
-
-                       if (bch2_crc_cmp(want, got)) {
-                               struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
-                               if (ca) {
-                                       struct printbuf err = PRINTBUF;
-
-                                       prt_str(&err, "stripe ");
-                                       bch2_csum_err_msg(&err, v->csum_type, want, got);
-                                       prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
-                                       bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
-                                       bch_err_ratelimited(ca, "%s", err.buf);
-                                       printbuf_exit(&err);
-
-                                       bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-                               }
-
-                               clear_bit(i, buf->valid);
-                               break;
-                       }
-
-                       offset += len;
-               }
-       }
-}
-
-/* Erasure coding: */
-
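-/* Compute the parity blocks from the data blocks, via raid_gen(): */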
-static void ec_generate_ec(struct ec_stripe_buf *buf)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned nr_data = v->nr_blocks - v->nr_redundant;
-       unsigned bytes = le16_to_cpu(v->sectors) << 9;
-
-       raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
-       return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
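-/*
- * Reconstruct failed data blocks from the surviving ones, via raid_rec();
- * fails if more blocks failed than we have redundancy:
- */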
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
-       unsigned nr_data = v->nr_blocks - v->nr_redundant;
-       unsigned bytes = buf->size << 9;
-
-       if (ec_nr_failed(buf) > v->nr_redundant) {
-               bch_err_ratelimited(c,
-                       "error doing reconstruct read: unable to read enough blocks");
-               return -1;
-       }
-
-       for (i = 0; i < nr_data; i++)
-               if (!test_bit(i, buf->valid))
-                       failed[nr_failed++] = i;
-
-       raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
-       return 0;
-}
-
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
-       struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
-       struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
-       struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
-       struct bch_dev *ca = ec_bio->ca;
-       struct closure *cl = bio->bi_private;
-       int rw = ec_bio->rw;
-       unsigned ref = rw == READ
-               ? BCH_DEV_READ_REF_ec_block
-               : BCH_DEV_WRITE_REF_ec_block;
-
-       bch2_account_io_completion(ca, bio_data_dir(bio),
-                                  ec_bio->submit_time, !bio->bi_status);
-
-       if (bio->bi_status) {
-               bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
-                              str_write_read(bio_data_dir(bio)),
-                              bch2_blk_status_to_str(bio->bi_status));
-               clear_bit(ec_bio->idx, ec_bio->buf->valid);
-       }
-
-       int stale = dev_ptr_stale(ca, ptr);
-       if (stale) {
-               bch_err_ratelimited(ca->fs,
-                                   "error %s stripe: stale/invalid pointer (%i) after io",
-                                   bio_data_dir(bio) == READ ? "reading from" : "writing to",
-                                   stale);
-               clear_bit(ec_bio->idx, ec_bio->buf->valid);
-       }
-
-       bio_put(&ec_bio->bio);
-       enumerated_ref_put(&ca->io_ref[rw], ref);
-       closure_put(cl);
-}
-
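-/*
- * Read or write a single stripe block, splitting the IO into bios of at most
- * BIO_MAX_VECS pages each; every bio submitted takes its own closure ref and
- * device ioref:
- */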
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
-                       blk_opf_t opf, unsigned idx, struct closure *cl)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-       unsigned offset = 0, bytes = buf->size << 9;
-       struct bch_extent_ptr *ptr = &v->ptrs[idx];
-       enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
-               ? BCH_DATA_user
-               : BCH_DATA_parity;
-       int rw = op_is_write(opf);
-       unsigned ref = rw == READ
-               ? BCH_DEV_READ_REF_ec_block
-               : BCH_DEV_WRITE_REF_ec_block;
-
-       struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
-       if (!ca) {
-               clear_bit(idx, buf->valid);
-               return;
-       }
-
-       int stale = dev_ptr_stale(ca, ptr);
-       if (stale) {
-               bch_err_ratelimited(c,
-                                   "error %s stripe: stale pointer (%i)",
-                                   rw == READ ? "reading from" : "writing to",
-                                   stale);
-               clear_bit(idx, buf->valid);
-               /* drop the ref taken by bch2_dev_get_ioref() above: */
-               enumerated_ref_put(&ca->io_ref[rw], ref);
-               return;
-       }
-
-       this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
-       while (offset < bytes) {
-               unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
-                                          DIV_ROUND_UP(bytes, PAGE_SIZE));
-               unsigned b = min_t(size_t, bytes - offset,
-                                  nr_iovecs << PAGE_SHIFT);
-               struct ec_bio *ec_bio;
-
-               ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
-                                                      nr_iovecs,
-                                                      opf,
-                                                      GFP_KERNEL,
-                                                      &c->ec_bioset),
-                                     struct ec_bio, bio);
-
-               ec_bio->ca                      = ca;
-               ec_bio->buf                     = buf;
-               ec_bio->idx                     = idx;
-               ec_bio->rw                      = rw;
-               ec_bio->submit_time             = local_clock();
-
-               ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
-               ec_bio->bio.bi_end_io           = ec_block_endio;
-               ec_bio->bio.bi_private          = cl;
-
-               bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
-               closure_get(cl);
-               enumerated_ref_get(&ca->io_ref[rw], ref);
-
-               submit_bio(&ec_bio->bio);
-
-               offset += b;
-       }
-
-       enumerated_ref_put(&ca->io_ref[rw], ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
-                               struct ec_stripe_buf *stripe)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-                              POS(0, idx), BTREE_ITER_slots);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-       if (k.k->type != KEY_TYPE_stripe) {
-               ret = -ENOENT;
-               goto err;
-       }
-       bkey_reassemble(&stripe->key, k);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
-                       struct bkey_s_c orig_k)
-{
-       struct bch_fs *c = trans->c;
-       struct ec_stripe_buf *buf = NULL;
-       struct closure cl;
-       struct bch_stripe *v;
-       unsigned i, offset;
-       const char *msg = NULL;
-       struct printbuf msgbuf = PRINTBUF;
-       int ret = 0;
-
-       closure_init_stack(&cl);
-
-       BUG_ON(!rbio->pick.has_ec);
-
-       buf = kzalloc(sizeof(*buf), GFP_NOFS);
-       if (!buf)
-               return bch_err_throw(c, ENOMEM_ec_read_extent);
-
-       ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
-       if (ret) {
-               msg = "stripe not found";
-               goto err;
-       }
-
-       v = &bkey_i_to_stripe(&buf->key)->v;
-
-       if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
-               msg = "pointer doesn't match stripe";
-               goto err;
-       }
-
-       offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
-       if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
-               msg = "read is bigger than stripe";
-               goto err;
-       }
-
-       ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
-       if (ret) {
-               msg = "-ENOMEM";
-               goto err;
-       }
-
-       for (i = 0; i < v->nr_blocks; i++)
-               ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
-       closure_sync(&cl);
-
-       if (ec_nr_failed(buf) > v->nr_redundant) {
-               msg = "unable to read enough blocks";
-               goto err;
-       }
-
-       ec_validate_checksums(c, buf);
-
-       ret = ec_do_recov(c, buf);
-       if (ret)
-               goto err;
-
-       memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
-                     buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-out:
-       ec_stripe_buf_exit(buf);
-       kfree(buf);
-       return ret;
-err:
-       bch2_bkey_val_to_text(&msgbuf, c, orig_k);
-       bch_err_ratelimited(c,
-                           "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
-       printbuf_exit(&msgbuf);
-       ret = bch_err_throw(c, stripe_reconstruct);
-       goto out;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
-       if (c->gc_pos.phase != GC_PHASE_not_running &&
-           !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
-               return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
-
-       return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
-                              struct btree_iter *iter)
-{
-       return allocate_dropping_locks_errcode(trans,
-                       __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
-       unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-       struct ec_stripe_new *s;
-
-       hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
-               if (s->idx == idx)
-                       return true;
-       return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
-       bool ret = false;
-
-       spin_lock(&c->ec_stripes_new_lock);
-       ret = __bch2_stripe_is_open(c, idx);
-       spin_unlock(&c->ec_stripes_new_lock);
-
-       return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
-                                struct ec_stripe_new *s,
-                                u64 idx)
-{
-       bool ret;
-
-       spin_lock(&c->ec_stripes_new_lock);
-       ret = !__bch2_stripe_is_open(c, idx);
-       if (ret) {
-               unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
-               s->idx = idx;
-               hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
-       }
-       spin_unlock(&c->ec_stripes_new_lock);
-
-       return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
-       BUG_ON(!s->idx);
-
-       spin_lock(&c->ec_stripes_new_lock);
-       hlist_del_init(&s->hash);
-       spin_unlock(&c->ec_stripes_new_lock);
-
-       s->idx = 0;
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
-                                              BTREE_ID_stripes, POS(0, idx),
-                                              BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       /*
-        * We expect write buffer races here
-        * Important: check stripe_is_open with stripe key locked:
-        */
-       if (k.k->type == KEY_TYPE_stripe &&
-           !bch2_stripe_is_open(trans->c, idx) &&
-           stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
-               ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * XXX
- * can we kill this and delete stripes from the trigger?
- */
-static void ec_stripe_delete_work(struct work_struct *work)
-{
-       struct bch_fs *c =
-               container_of(work, struct bch_fs, ec_stripe_delete_work);
-
-       bch2_trans_run(c,
-               bch2_btree_write_buffer_tryflush(trans) ?:
-               for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
-                               lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
-                               lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
-                               0, lru_k,
-                               NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc, ({
-                       ec_stripe_delete(trans, lru_k.k->p.offset);
-               })));
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
-       if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
-           !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
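-/*
- * Create the stripe key, or update an existing one: carries the per-block
- * sector counts over from the old key, and checks that no nonempty block had
- * its pointer changed out from under us:
- */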
-static int ec_stripe_key_update(struct btree_trans *trans,
-                               struct bkey_i_stripe *old,
-                               struct bkey_i_stripe *new)
-{
-       struct bch_fs *c = trans->c;
-       bool create = !old;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-                                              new->k.p, BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
-                                   c, "error %s stripe: got existing key type %s",
-                                   create ? "creating" : "updating",
-                                   bch2_bkey_types[k.k->type])) {
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (k.k->type == KEY_TYPE_stripe) {
-               const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
-
-               BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
-               BUG_ON(old->v.nr_blocks != v->nr_blocks);
-
-               for (unsigned i = 0; i < new->v.nr_blocks; i++) {
-                       unsigned sectors = stripe_blockcount_get(v, i);
-
-                       if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
-                               struct printbuf buf = PRINTBUF;
-
-                               prt_printf(&buf, "stripe changed nonempty block %u", i);
-                               prt_str(&buf, "\nold: ");
-                               bch2_bkey_val_to_text(&buf, c, k);
-                               prt_str(&buf, "\nnew: ");
-                               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
-                               bch2_fs_inconsistent(c, "%s", buf.buf);
-                               printbuf_exit(&buf);
-                               ret = -EINVAL;
-                               goto err;
-                       }
-
-                       /*
-                        * If the stripe ptr changed underneath us, it must have
-                        * been dev_remove_stripes() -> invalidate_stripe_to_dev()
-                        */
-                       if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
-                               BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
-
-                               if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
-                                       new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
-                       }
-
-                       stripe_blockcount_set(&new->v, i, sectors);
-               }
-       }
-
-       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
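-/*
- * Given a backpointer into one of the stripe's buckets, rewrite the extent it
- * points at to carry a stripe_ptr entry referencing this stripe:
- */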
-static int ec_stripe_update_extent(struct btree_trans *trans,
-                                  struct bch_dev *ca,
-                                  struct bpos bucket, u8 gen,
-                                  struct ec_stripe_buf *s,
-                                  struct bkey_s_c_backpointer bp,
-                                  struct bkey_buf *last_flushed)
-{
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       const struct bch_extent_ptr *ptr_c;
-       struct bch_extent_ptr *ec_ptr = NULL;
-       struct bch_extent_stripe_ptr stripe_ptr;
-       struct bkey_i *n;
-       int ret, dev;
-       unsigned block;
-
-       if (bp.v->level) {
-               struct printbuf buf = PRINTBUF;
-               struct btree_iter node_iter;
-               struct btree *b;
-
-               b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
-               bch2_trans_iter_exit(trans, &node_iter);
-
-               if (!b)
-                       return 0;
-
-               prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
-               bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
-               bch2_fs_inconsistent(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               return bch_err_throw(c, erasure_coding_found_btree_node);
-       }
-
-       k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-       if (!k.k) {
-               /*
-                * extent no longer exists - we could flush the btree
-                * write buffer and retry to verify, but no need:
-                */
-               return 0;
-       }
-
-       if (extent_has_stripe_ptr(k, s->key.k.p.offset))
-               goto out;
-
-       ptr_c = bkey_matches_stripe(v, k, &block);
-       /*
-        * It doesn't generally make sense to erasure code cached ptrs:
-        * XXX: should we be incrementing a counter?
-        */
-       if (!ptr_c || ptr_c->cached)
-               goto out;
-
-       dev = v->ptrs[block].dev;
-
-       n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
-       ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               goto out;
-
-       bkey_reassemble(n, k);
-
-       bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
-       ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
-       BUG_ON(!ec_ptr);
-
-       stripe_ptr = (struct bch_extent_stripe_ptr) {
-               .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
-               .block          = block,
-               .redundancy     = v->nr_redundant,
-               .idx            = s->key.k.p.offset,
-       };
-
-       __extent_entry_insert(n,
-                       (union bch_extent_entry *) ec_ptr,
-                       (union bch_extent_entry *) &stripe_ptr);
-
-       ret = bch2_trans_update(trans, &iter, n, 0);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
-                                  unsigned block)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-       struct bch_extent_ptr ptr = v->ptrs[block];
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
-       if (!ca)
-               return bch_err_throw(c, ENOENT_dev_not_found);
-
-       struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
-                       bucket_pos_to_bp_start(ca, bucket_pos),
-                       bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
-                       NULL, NULL,
-                       BCH_TRANS_COMMIT_no_check_rw|
-                       BCH_TRANS_COMMIT_no_enospc, ({
-               if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
-                       break;
-
-               if (bp_k.k->type != KEY_TYPE_backpointer)
-                       continue;
-
-               struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-               if (bp.v->btree_id == BTREE_ID_stripes)
-                       continue;
-
-               ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
-                                       bp, &last_flushed);
-       }));
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch2_dev_put(ca);
-       return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
-       unsigned nr_data = v->nr_blocks - v->nr_redundant;
-
-       int ret = bch2_btree_write_buffer_flush_sync(trans);
-       if (ret)
-               goto err;
-
-       for (unsigned i = 0; i < nr_data; i++) {
-               ret = ec_stripe_update_bucket(trans, s, i);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_put(trans);
-       return ret;
-}
-
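-/*
- * Parity is computed over the whole bucket, so zero out the unwritten tail of
- * an ec bucket, both in the stripe buf and on disk:
- */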
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
-                                      struct ec_stripe_new *s,
-                                      unsigned block,
-                                      struct open_bucket *ob)
-{
-       struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
-                               BCH_DEV_WRITE_REF_ec_bucket_zero);
-       if (!ca) {
-               s->err = bch_err_throw(c, erofs_no_writes);
-               return;
-       }
-
-       unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-       memset(s->new_stripe.data[block] + (offset << 9),
-              0,
-              ob->sectors_free << 9);
-
-       int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
-                       ob->bucket * ca->mi.bucket_size + offset,
-                       ob->sectors_free,
-                       GFP_KERNEL, 0);
-
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
-
-       if (ret)
-               s->err = ret;
-}
-
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
-       if (s->idx)
-               bch2_stripe_close(c, s);
-       kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
-       struct bch_fs *c = s->c;
-       struct open_bucket *ob;
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
-       unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-       int ret;
-
-       BUG_ON(s->h->s == s);
-
-       closure_sync(&s->iodone);
-
-       if (!s->err) {
-               for (i = 0; i < nr_data; i++)
-                       if (s->blocks[i]) {
-                               ob = c->open_buckets + s->blocks[i];
-
-                               if (ob->sectors_free)
-                                       zero_out_rest_of_ec_bucket(c, s, i, ob);
-                       }
-       }
-
-       if (s->err) {
-               if (!bch2_err_matches(s->err, EROFS))
-                       bch_err(c, "error creating stripe: error writing data buckets");
-               ret = s->err;
-               goto err;
-       }
-
-       if (s->have_existing_stripe) {
-               ec_validate_checksums(c, &s->existing_stripe);
-
-               if (ec_do_recov(c, &s->existing_stripe)) {
-                       bch_err(c, "error creating stripe: error reading existing stripe");
-                       ret = bch_err_throw(c, ec_block_read);
-                       goto err;
-               }
-
-               for (i = 0; i < nr_data; i++)
-                       if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
-                               swap(s->new_stripe.data[i],
-                                    s->existing_stripe.data[i]);
-
-               ec_stripe_buf_exit(&s->existing_stripe);
-       }
-
-       BUG_ON(!s->allocated);
-       BUG_ON(!s->idx);
-
-       ec_generate_ec(&s->new_stripe);
-
-       ec_generate_checksums(&s->new_stripe);
-
-       /* write p/q: */
-       for (i = nr_data; i < v->nr_blocks; i++)
-               ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
-       closure_sync(&s->iodone);
-
-       if (ec_nr_failed(&s->new_stripe)) {
-               bch_err(c, "error creating stripe: error writing redundancy buckets");
-               ret = bch_err_throw(c, ec_block_write);
-               goto err;
-       }
-
-       ret = bch2_trans_commit_do(c, &s->res, NULL,
-               BCH_TRANS_COMMIT_no_check_rw|
-               BCH_TRANS_COMMIT_no_enospc,
-               ec_stripe_key_update(trans,
-                                    s->have_existing_stripe
-                                    ? bkey_i_to_stripe(&s->existing_stripe.key)
-                                    : NULL,
-                                    bkey_i_to_stripe(&s->new_stripe.key)));
-       bch_err_msg(c, ret, "creating stripe key");
-       if (ret)
-               goto err;
-
-       ret = ec_stripe_update_extents(c, &s->new_stripe);
-       bch_err_msg(c, ret, "updating extents");
-err:
-       trace_stripe_create(c, s->idx, ret);
-
-       bch2_disk_reservation_put(c, &s->res);
-
-       for (i = 0; i < v->nr_blocks; i++)
-               if (s->blocks[i]) {
-                       ob = c->open_buckets + s->blocks[i];
-
-                       if (i < nr_data) {
-                               ob->ec = NULL;
-                               __bch2_open_bucket_put(c, ob);
-                       } else {
-                               bch2_open_bucket_put(c, ob);
-                       }
-               }
-
-       mutex_lock(&c->ec_stripe_new_lock);
-       list_del(&s->list);
-       mutex_unlock(&c->ec_stripe_new_lock);
-       wake_up(&c->ec_stripe_new_wait);
-
-       ec_stripe_buf_exit(&s->existing_stripe);
-       ec_stripe_buf_exit(&s->new_stripe);
-       closure_debug_destroy(&s->iodone);
-
-       ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
-       struct ec_stripe_new *s;
-
-       mutex_lock(&c->ec_stripe_new_lock);
-       list_for_each_entry(s, &c->ec_stripe_new_list, list)
-               if (!atomic_read(&s->ref[STRIPE_REF_io]))
-                       goto out;
-       s = NULL;
-out:
-       mutex_unlock(&c->ec_stripe_new_lock);
-
-       return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work,
-               struct bch_fs, ec_stripe_create_work);
-       struct ec_stripe_new *s;
-
-       while ((s = get_pending_stripe(c)))
-               ec_stripe_create(s);
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
-       enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
-
-       if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
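-/*
- * Detach the stripe from its head and put it on the pending list: it becomes
- * eligible for creation once its io ref hits zero (see get_pending_stripe()):
- */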
-static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
-       struct ec_stripe_new *s = h->s;
-
-       lockdep_assert_held(&h->lock);
-
-       BUG_ON(!s->allocated && !s->err);
-
-       h->s            = NULL;
-       s->pending      = true;
-
-       mutex_lock(&c->ec_stripe_new_lock);
-       list_add(&s->list, &c->ec_stripe_new_list);
-       mutex_unlock(&c->ec_stripe_new_lock);
-
-       ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
-{
-       h->s->err = err;
-       ec_stripe_new_set_pending(c, h);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
-{
-       struct ec_stripe_new *s = ob->ec;
-
-       s->err = err;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
-       struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
-       if (!ob)
-               return NULL;
-
-       BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
-       struct bch_dev *ca      = ob_dev(c, ob);
-       unsigned offset         = ca->mi.bucket_size - ob->sectors_free;
-
-       return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
-       unsigned l = *((const unsigned *) _l);
-       unsigned r = *((const unsigned *) _r);
-
-       return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
-                              struct bch_devs_mask *devs)
-{
-       unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
-       struct {
-               unsigned nr, size;
-       } cur = { 0, 0 }, best = { 0, 0 };
-
-       for_each_member_device_rcu(c, ca, devs)
-               sizes[nr++] = ca->mi.bucket_size;
-
-       sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
-       for (unsigned i = 0; i < nr; i++) {
-               if (sizes[i] != cur.size) {
-                       if (cur.nr > best.nr)
-                               best = cur;
-
-                       cur.nr = 0;
-                       cur.size = sizes[i];
-               }
-
-               cur.nr++;
-       }
-
-       if (cur.nr > best.nr)
-               best = cur;
-
-       return best.size;
-}
-
-static bool may_create_new_stripe(struct bch_fs *c)
-{
-       return false;
-}
-
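-/*
- * Initialize a new stripe key, widening the checksum granularity until the
- * value fits in BKEY_VAL_U64s_MAX:
- */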
-static void ec_stripe_key_init(struct bch_fs *c,
-                              struct bkey_i *k,
-                              unsigned nr_data,
-                              unsigned nr_parity,
-                              unsigned stripe_size,
-                              unsigned disk_label)
-{
-       struct bkey_i_stripe *s = bkey_stripe_init(k);
-       unsigned u64s;
-
-       s->v.sectors                    = cpu_to_le16(stripe_size);
-       s->v.algorithm                  = 0;
-       s->v.nr_blocks                  = nr_data + nr_parity;
-       s->v.nr_redundant               = nr_parity;
-       s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
-       s->v.csum_type                  = BCH_CSUM_crc32c;
-       s->v.disk_label                 = disk_label;
-
-       while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
-               BUG_ON(1 << s->v.csum_granularity_bits >=
-                      le16_to_cpu(s->v.sectors) ||
-                      s->v.csum_granularity_bits == U8_MAX);
-               s->v.csum_granularity_bits++;
-       }
-
-       set_bkey_val_u64s(&s->k, u64s);
-}
-
-static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
-       struct ec_stripe_new *s;
-
-       lockdep_assert_held(&h->lock);
-
-       s = kzalloc(sizeof(*s), GFP_KERNEL);
-       if (!s)
-               return NULL;
-
-       mutex_init(&s->lock);
-       closure_init(&s->iodone, NULL);
-       atomic_set(&s->ref[STRIPE_REF_stripe], 1);
-       atomic_set(&s->ref[STRIPE_REF_io], 1);
-       s->c            = c;
-       s->h            = h;
-       s->nr_data      = min_t(unsigned, h->nr_active_devs,
-                               BCH_BKEY_PTRS_MAX) - h->redundancy;
-       s->nr_parity    = h->redundancy;
-
-       ec_stripe_key_init(c, &s->new_stripe.key,
-                          s->nr_data, s->nr_parity,
-                          h->blocksize, h->disk_label);
-       return s;
-}
-
-static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
-{
-       struct bch_devs_mask devs = h->devs;
-       unsigned nr_devs, nr_devs_with_durability;
-
-       scoped_guard(rcu) {
-               h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
-                                        ? group_to_target(h->disk_label - 1)
-                                        : 0);
-               nr_devs = dev_mask_nr(&h->devs);
-
-               for_each_member_device_rcu(c, ca, &h->devs)
-                       if (!ca->mi.durability)
-                               __clear_bit(ca->dev_idx, h->devs.d);
-               nr_devs_with_durability = dev_mask_nr(&h->devs);
-
-               h->blocksize = pick_blocksize(c, &h->devs);
-
-               h->nr_active_devs = 0;
-               for_each_member_device_rcu(c, ca, &h->devs)
-                       if (ca->mi.bucket_size == h->blocksize)
-                               h->nr_active_devs++;
-       }
-
-       /*
-        * If we only have redundancy + 1 devices, we're better off with just
-        * replication:
-        */
-       h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
-
-       if (h->insufficient_devs) {
-               const char *err;
-
-               if (nr_devs < h->redundancy + 2)
-                       err = NULL;
-               else if (nr_devs_with_durability < h->redundancy + 2)
-                       err = "cannot use durability=0 devices";
-               else
-                       err = "mismatched bucket sizes";
-
-               if (err)
-                       bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
-                               h->nr_active_devs, h->redundancy + 2, err);
-       }
-
-       struct bch_devs_mask devs_leaving;
-       bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
-
-       if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
-               ec_stripe_new_cancel(c, h, -EINTR);
-
-       h->rw_devs_change_count = c->rw_devs_change_count;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
-                        unsigned algo, unsigned redundancy,
-                        enum bch_watermark watermark)
-{
-       struct ec_stripe_head *h;
-
-       h = kzalloc(sizeof(*h), GFP_KERNEL);
-       if (!h)
-               return NULL;
-
-       mutex_init(&h->lock);
-       BUG_ON(!mutex_trylock(&h->lock));
-
-       h->disk_label   = disk_label;
-       h->algo         = algo;
-       h->redundancy   = redundancy;
-       h->watermark    = watermark;
-
-       list_add(&h->list, &c->ec_stripe_head_list);
-       return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
-       if (h->s &&
-           h->s->allocated &&
-           bitmap_weight(h->s->blocks_allocated,
-                         h->s->nr_data) == h->s->nr_data)
-               ec_stripe_new_set_pending(c, h);
-
-       mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
-                         unsigned disk_label,
-                         unsigned algo,
-                         unsigned redundancy,
-                         enum bch_watermark watermark)
-{
-       struct bch_fs *c = trans->c;
-       struct ec_stripe_head *h;
-       int ret;
-
-       if (!redundancy)
-               return NULL;
-
-       ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
-       if (ret)
-               return ERR_PTR(ret);
-
-       if (test_bit(BCH_FS_going_ro, &c->flags)) {
-               h = ERR_PTR(-BCH_ERR_erofs_no_writes);
-               goto err;
-       }
-
-       list_for_each_entry(h, &c->ec_stripe_head_list, list)
-               if (h->disk_label       == disk_label &&
-                   h->algo             == algo &&
-                   h->redundancy       == redundancy &&
-                   h->watermark        == watermark) {
-                       ret = bch2_trans_mutex_lock(trans, &h->lock);
-                       if (ret) {
-                               h = ERR_PTR(ret);
-                               goto err;
-                       }
-                       goto found;
-               }
-
-       h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
-       if (!h) {
-               h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
-               goto err;
-       }
-found:
-       if (h->rw_devs_change_count != c->rw_devs_change_count)
-               ec_stripe_head_devs_update(c, h);
-
-       if (h->insufficient_devs) {
-               mutex_unlock(&h->lock);
-               h = NULL;
-       }
-err:
-       mutex_unlock(&c->ec_stripe_head_lock);
-       return h;
-}
-
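-/*
- * Allocate buckets for the stripe blocks we don't have yet - parity first,
- * then data - saving and restoring the caller's alloc_request fields around
- * the allocation:
- */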
-static int new_stripe_alloc_buckets(struct btree_trans *trans,
-                                   struct alloc_request *req,
-                                   struct ec_stripe_head *h, struct ec_stripe_new *s,
-                                   struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       struct open_bucket *ob;
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
-       unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
-       int ret = 0;
-
-       req->scratch_data_type          = req->data_type;
-       req->scratch_ptrs               = req->ptrs;
-       req->scratch_nr_replicas        = req->nr_replicas;
-       req->scratch_nr_effective       = req->nr_effective;
-       req->scratch_have_cache         = req->have_cache;
-       req->scratch_devs_may_alloc     = req->devs_may_alloc;
-
-       req->devs_may_alloc     = h->devs;
-       req->have_cache         = true;
-
-       BUG_ON(v->nr_blocks     != s->nr_data + s->nr_parity);
-       BUG_ON(v->nr_redundant  != s->nr_parity);
-
-       /* We bypass the sector allocator which normally does this: */
-       bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
-                  c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
-
-       for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
-               /*
-                * Note: we don't yet repair invalid blocks (failed/removed
-                * devices) when reusing stripes - we still need a codepath to
-                * walk backpointers and update all extents that point to that
-                * block when updating the stripe
-                */
-               if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
-                       __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
-
-               if (i < s->nr_data)
-                       nr_have_data++;
-               else
-                       nr_have_parity++;
-       }
-
-       BUG_ON(nr_have_data     > s->nr_data);
-       BUG_ON(nr_have_parity   > s->nr_parity);
-
-       req->ptrs.nr = 0;
-       if (nr_have_parity < s->nr_parity) {
-               req->nr_replicas        = s->nr_parity;
-               req->nr_effective       = nr_have_parity;
-               req->data_type          = BCH_DATA_parity;
-
-               ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
-
-               open_bucket_for_each(c, &req->ptrs, ob, i) {
-                       j = find_next_zero_bit(s->blocks_gotten,
-                                              s->nr_data + s->nr_parity,
-                                              s->nr_data);
-                       BUG_ON(j >= s->nr_data + s->nr_parity);
-
-                       s->blocks[j] = req->ptrs.v[i];
-                       v->ptrs[j] = bch2_ob_ptr(c, ob);
-                       __set_bit(j, s->blocks_gotten);
-               }
-
-               if (ret)
-                       goto err;
-       }
-
-       req->ptrs.nr = 0;
-       if (nr_have_data < s->nr_data) {
-               req->nr_replicas        = s->nr_data;
-               req->nr_effective       = nr_have_data;
-               req->data_type          = BCH_DATA_user;
-
-               ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
-
-               open_bucket_for_each(c, &req->ptrs, ob, i) {
-                       j = find_next_zero_bit(s->blocks_gotten,
-                                              s->nr_data, 0);
-                       BUG_ON(j >= s->nr_data);
-
-                       s->blocks[j] = req->ptrs.v[i];
-                       v->ptrs[j] = bch2_ob_ptr(c, ob);
-                       __set_bit(j, s->blocks_gotten);
-               }
-
-               if (ret)
-                       goto err;
-       }
-err:
-       req->data_type          = req->scratch_data_type;
-       req->ptrs               = req->scratch_ptrs;
-       req->nr_replicas        = req->scratch_nr_replicas;
-       req->nr_effective       = req->scratch_nr_effective;
-       req->have_cache         = req->scratch_have_cache;
-       req->devs_may_alloc     = req->scratch_devs_may_alloc;
-       return ret;
-}
-
-static int __get_existing_stripe(struct btree_trans *trans,
-                                struct ec_stripe_head *head,
-                                struct ec_stripe_buf *stripe,
-                                u64 idx)
-{
-       struct bch_fs *c = trans->c;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
-                                         BTREE_ID_stripes, POS(0, idx), 0);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       /* We expect write buffer races here */
-       if (k.k->type != KEY_TYPE_stripe)
-               goto out;
-
-       struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-       if (stripe_lru_pos(s.v) <= 1)
-               goto out;
-
-       if (s.v->disk_label             == head->disk_label &&
-           s.v->algorithm              == head->algo &&
-           s.v->nr_redundant           == head->redundancy &&
-           le16_to_cpu(s.v->sectors)   == head->blocksize &&
-           bch2_try_open_stripe(c, head->s, idx)) {
-               bkey_reassemble(&stripe->key, k);
-               ret = 1;
-       }
-out:
-       bch2_set_btree_iter_dontneed(trans, &iter);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
-{
-       struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
-       struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
-       unsigned i;
-
-       BUG_ON(existing_v->nr_redundant != s->nr_parity);
-       s->nr_data = existing_v->nr_blocks -
-               existing_v->nr_redundant;
-
-       int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
-       if (ret) {
-               bch2_stripe_close(c, s);
-               return ret;
-       }
-
-       BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
-       /*
-        * Free buckets we initially allocated - they might conflict with
-        * blocks from the stripe we're reusing:
-        */
-       for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
-               bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
-               s->blocks[i] = 0;
-       }
-       memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
-       memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
-
-       for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
-               if (stripe_blockcount_get(existing_v, i)) {
-                       __set_bit(i, s->blocks_gotten);
-                       __set_bit(i, s->blocks_allocated);
-               }
-
-               ec_block_io(c, &s->existing_stripe, REQ_OP_READ, i, &s->iodone);
-       }
-
-       bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
-       s->have_existing_stripe = true;
-
-       return 0;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
-                                      struct ec_stripe_new *s)
-{
-       struct bch_fs *c = trans->c;
-
-       /*
-        * If we can't allocate a new stripe, and there are no stripes with empty
-        * blocks for us to reuse, that means we have to wait on copygc:
-        */
-       if (may_create_new_stripe(c))
-               return -1;
-
-       struct btree_iter lru_iter;
-       struct bkey_s_c lru_k;
-       int ret = 0;
-
-       for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
-                       lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
-                       lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
-                       0, lru_k, ret) {
-               ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &lru_iter);
-       if (!ret)
-               ret = bch_err_throw(c, stripe_alloc_blocked);
-       if (ret == 1)
-               ret = 0;
-       if (ret)
-               return ret;
-
-       return init_new_stripe_from_existing(c, s);
-}
-
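-/*
- * Take a disk reservation for the stripe being created, then scan the stripes
- * btree for an empty slot, starting from the hint and wrapping around once:
- */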
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
-                                        struct ec_stripe_new *s)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bpos min_pos = POS(0, 1);
-       struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
-       int ret;
-
-       if (!s->res.sectors) {
-               ret = bch2_disk_reservation_get(c, &s->res,
-                                       h->blocksize,
-                                       s->nr_parity,
-                                       BCH_DISK_RESERVATION_NOFAIL);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * Allocate stripe slot
-        * XXX: we're going to need a bitrange btree of free stripes
-        */
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
-                          BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
-               if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
-                       if (start_pos.offset) {
-                               start_pos = min_pos;
-                               bch2_btree_iter_set_pos(trans, &iter, start_pos);
-                               continue;
-                       }
-
-                       ret = bch_err_throw(c, ENOSPC_stripe_create);
-                       break;
-               }
-
-               if (bkey_deleted(k.k) &&
-                   bch2_try_open_stripe(c, s, k.k->p.offset))
-                       break;
-       }
-
-       c->ec_stripe_hint = iter.pos.offset;
-
-       if (ret)
-               goto err;
-
-       ret = ec_stripe_mem_alloc(trans, &iter);
-       if (ret) {
-               bch2_stripe_close(c, s);
-               goto err;
-       }
-
-       s->new_stripe.key.k.p = iter.pos;
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-err:
-       bch2_disk_reservation_put(c, &s->res);
-       goto out;
-}
-
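-/*
- * Get a stripe head with a stripe to write to: first try to allocate a brand
- * new stripe, then fall back to reusing an existing one with empty blocks,
- * waiting on the freelist if neither is currently possible:
- */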
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
-                                              struct alloc_request *req,
-                                              unsigned algo,
-                                              struct closure *cl)
-{
-       struct bch_fs *c = trans->c;
-       unsigned redundancy = req->nr_replicas - 1;
-       unsigned disk_label = 0;
-       struct target t = target_decode(req->target);
-       bool waiting = false;
-       int ret;
-
-       if (t.type == TARGET_GROUP) {
-               if (t.group > U8_MAX) {
-                       bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
-                       return NULL;
-               }
-               disk_label = t.group + 1; /* 0 == no label */
-       }
-
-       struct ec_stripe_head *h =
-               __bch2_ec_stripe_head_get(trans, disk_label, algo,
-                                         redundancy, req->watermark);
-       if (IS_ERR_OR_NULL(h))
-               return h;
-
-       if (!h->s) {
-               h->s = ec_new_stripe_alloc(c, h);
-               if (!h->s) {
-                       ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
-                       bch_err(c, "failed to allocate new stripe");
-                       goto err;
-               }
-
-               h->nr_created++;
-       }
-
-       struct ec_stripe_new *s = h->s;
-
-       if (s->allocated)
-               goto allocated;
-
-       if (s->have_existing_stripe)
-               goto alloc_existing;
-
-       /* First, try to allocate a full stripe: */
-       enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
-       swap(req->watermark, saved_watermark);
-       ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
-               __bch2_ec_stripe_head_reserve(trans, h, s);
-       swap(req->watermark, saved_watermark);
-
-       if (!ret)
-               goto allocate_buf;
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-           bch2_err_matches(ret, ENOMEM))
-               goto err;
-
-       /*
-        * Not enough buckets available for a full stripe: we must reuse an
-        * existing stripe:
-        */
-       while (1) {
-               ret = __bch2_ec_stripe_head_reuse(trans, h, s);
-               if (!ret)
-                       break;
-               if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
-                       goto err;
-
-               if (req->watermark == BCH_WATERMARK_copygc) {
-                       ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
-                               __bch2_ec_stripe_head_reserve(trans, h, s);
-                       if (ret)
-                               goto err;
-                       goto allocate_buf;
-               }
-
-               /* XXX freelist_wait? */
-               closure_wait(&c->freelist_wait, cl);
-               waiting = true;
-       }
-
-       if (waiting)
-               closure_wake_up(&c->freelist_wait);
-alloc_existing:
-       /*
-        * Retry allocating buckets, with the watermark for this
-        * particular write:
-        */
-       ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
-       if (ret)
-               goto err;
-
-allocate_buf:
-       ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
-       if (ret)
-               goto err;
-
-       s->allocated = true;
-allocated:
-       BUG_ON(!s->idx);
-       BUG_ON(!s->new_stripe.data[0]);
-       BUG_ON(trans->restarted);
-       return h;
-err:
-       bch2_ec_stripe_head_put(c, h);
-       return ERR_PTR(ret);
-}
-
-/* device removal */
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
-                                 struct btree_iter *iter,
-                                 struct bkey_s_c k,
-                                 unsigned dev_idx,
-                                 unsigned flags)
-{
-       if (k.k->type != KEY_TYPE_stripe)
-               return 0;
-
-       struct bch_fs *c = trans->c;
-       struct bkey_i_stripe *s =
-               bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
-       int ret = PTR_ERR_OR_ZERO(s);
-       if (ret)
-               return ret;
-
-       struct disk_accounting_pos acc;
-
-       s64 sectors = 0;
-       for (unsigned i = 0; i < s->v.nr_blocks; i++)
-               sectors -= stripe_blockcount_get(&s->v, i);
-
-       memset(&acc, 0, sizeof(acc));
-       acc.type = BCH_DISK_ACCOUNTING_replicas;
-       bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
-       acc.replicas.data_type = BCH_DATA_user;
-       ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-       if (ret)
-               return ret;
-
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
-
-       /* XXX: how much redundancy do we still have? check degraded flags */
-
-       unsigned nr_good = 0;
-
-       scoped_guard(rcu)
-               bkey_for_each_ptr(ptrs, ptr) {
-                       if (ptr->dev == dev_idx)
-                               ptr->dev = BCH_SB_MEMBER_INVALID;
-
-                       struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-                       nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
-               }
-
-       if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-               return bch_err_throw(c, remove_would_lose_data);
-
-       unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
-
-       if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
-               return bch_err_throw(c, remove_would_lose_data);
-
-       sectors = -sectors;
-
-       memset(&acc, 0, sizeof(acc));
-       acc.type = BCH_DISK_ACCOUNTING_replicas;
-       bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
-       acc.replicas.data_type = BCH_DATA_user;
-       return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-}
-
-static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
-                                                   unsigned flags)
-{
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
-       if (!a->stripe)
-               return 0;
-
-       if (a->stripe_sectors) {
-               struct bch_fs *c = trans->c;
-               bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
-               return bch_err_throw(c, invalidate_stripe_to_dev);
-       }
-
-       struct btree_iter iter;
-       struct bkey_s_c_stripe s =
-               bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
-                                        BTREE_ITER_slots, stripe);
-       int ret = bkey_err(s);
-       if (ret)
-               return ret;
-
-       ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_max_commit(trans, iter,
-                                 BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
-                                 BTREE_ITER_intent, k,
-                                 NULL, NULL, 0, ({
-                       bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
-       })));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/* startup/shutdown */
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct ec_stripe_head *h;
-       struct open_bucket *ob;
-       unsigned i;
-
-       mutex_lock(&c->ec_stripe_head_lock);
-       list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-               mutex_lock(&h->lock);
-               if (!h->s)
-                       goto unlock;
-
-               if (!ca)
-                       goto found;
-
-               for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
-                       if (!h->s->blocks[i])
-                               continue;
-
-                       ob = c->open_buckets + h->s->blocks[i];
-                       if (ob->dev == ca->dev_idx)
-                               goto found;
-               }
-               goto unlock;
-found:
-               ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
-unlock:
-               mutex_unlock(&h->lock);
-       }
-       mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
-       __bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
-       __bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
-       sched_annotate_sleep();
-
-       mutex_lock(&c->ec_stripe_new_lock);
-       bool ret = list_empty(&c->ec_stripe_new_list);
-       mutex_unlock(&c->ec_stripe_new_lock);
-
-       return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
-       wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
-       return 0;
-}
-
-static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
-                                   struct ec_stripe_new *s)
-{
-       prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
-                  s->idx, s->nr_data, s->nr_parity,
-                  bitmap_weight(s->blocks_allocated, s->nr_data),
-                  atomic_read(&s->ref[STRIPE_REF_io]),
-                  atomic_read(&s->ref[STRIPE_REF_stripe]),
-                  bch2_watermarks[s->h->watermark]);
-
-       struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
-       unsigned i;
-       for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
-               prt_printf(out, " %u", s->blocks[i]);
-       prt_newline(out);
-       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
-       prt_newline(out);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct ec_stripe_head *h;
-       struct ec_stripe_new *s;
-
-       mutex_lock(&c->ec_stripe_head_lock);
-       list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-               prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
-                      h->disk_label, h->algo, h->redundancy,
-                      bch2_watermarks[h->watermark],
-                      h->nr_created);
-
-               if (h->s)
-                       bch2_new_stripe_to_text(out, c, h->s);
-       }
-       mutex_unlock(&c->ec_stripe_head_lock);
-
-       prt_printf(out, "in flight:\n");
-
-       mutex_lock(&c->ec_stripe_new_lock);
-       list_for_each_entry(s, &c->ec_stripe_new_list, list)
-               bch2_new_stripe_to_text(out, c, s);
-       mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
-       struct ec_stripe_head *h;
-       unsigned i;
-
-       while (1) {
-               mutex_lock(&c->ec_stripe_head_lock);
-               h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
-               mutex_unlock(&c->ec_stripe_head_lock);
-
-               if (!h)
-                       break;
-
-               if (h->s) {
-                       for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
-                               BUG_ON(h->s->blocks[i]);
-
-                       kfree(h->s);
-               }
-               kfree(h);
-       }
-
-       BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
-       bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
-       spin_lock_init(&c->ec_stripes_new_lock);
-
-       INIT_LIST_HEAD(&c->ec_stripe_head_list);
-       mutex_init(&c->ec_stripe_head_lock);
-
-       INIT_LIST_HEAD(&c->ec_stripe_new_list);
-       mutex_init(&c->ec_stripe_new_lock);
-       init_waitqueue_head(&c->ec_stripe_new_wait);
-
-       INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
-       INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
-       return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
-                          BIOSET_NEED_BVECS);
-}
-
-static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
-                                       struct bkey_s_c k,
-                                       struct bkey_buf *last_flushed)
-{
-       if (k.k->type != KEY_TYPE_stripe)
-               return 0;
-
-       struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-       u64 lru_idx = stripe_lru_pos(s.v);
-       if (lru_idx) {
-               int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
-                                            k.k->p.offset, lru_idx, k, last_flushed);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
-{
-       struct bkey_buf last_flushed;
-
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
-                               POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch_err_fn(c, ret);
-       return ret;
-}
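
The out:/err: labels in __bch2_ec_stripe_head_reserve() above are the kernel's usual goto-based unwind: a single exit path, with the failure label dropping the disk reservation before rejoining it. A minimal self-contained sketch of the idiom (hypothetical names, not bcachefs API):

#include <stdio.h>

struct res { int held; };

static int acquire(struct res *r) { r->held = 1; return 0; }
static void release(struct res *r) { r->held = 0; }

static int do_op(struct res *r, int fail)
{
	int ret = acquire(r);
	if (ret)
		goto out;

	if (fail) {
		ret = -1;
		goto err;
	}
	/* success: the resource stays held, like the disk reservation */
out:
	return ret;
err:
	release(r);	/* mirrors bch2_disk_reservation_put() */
	goto out;
}

int main(void)
{
	struct res r = { 0 };
	printf("%d %d\n", do_op(&r, 0), do_op(&r, 1));
	return 0;
}
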
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 548048a..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
-                        struct bkey_validate_context);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
-                        struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
-                       struct bkey_s_c, struct bkey_s,
-                       enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) {      \
-       .key_validate   = bch2_stripe_validate,         \
-       .val_to_text    = bch2_stripe_to_text,          \
-       .swab           = bch2_ptr_swab,                \
-       .trigger        = bch2_trigger_stripe,          \
-       .min_val_size   = 8,                            \
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
-       return DIV_ROUND_UP(le16_to_cpu(s->sectors),
-                           1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
-                                         unsigned dev, unsigned csum_idx)
-{
-       EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
-       unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
-       return sizeof(struct bch_stripe) +
-               sizeof(struct bch_extent_ptr) * s->nr_blocks +
-               (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
-                                               unsigned idx)
-{
-       return stripe_csum_offset(s, s->nr_blocks, 0) +
-               sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
-                                            unsigned idx)
-{
-       return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
-                                        unsigned idx, unsigned v)
-{
-       __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
-       *p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
-       return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
-                           sizeof(u64));
-}
-
-static inline void *stripe_csum(struct bch_stripe *s,
-                               unsigned block, unsigned csum_idx)
-{
-       EBUG_ON(block >= s->nr_blocks);
-       EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
-       return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
-                                  unsigned block, unsigned csum_idx)
-{
-       struct bch_csum csum = { 0 };
-
-       memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
-       return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
-                                  unsigned block, unsigned csum_idx,
-                                  struct bch_csum csum)
-{
-       memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-#define STRIPE_LRU_POS_EMPTY   1
-
-static inline u64 stripe_lru_pos(const struct bch_stripe *s)
-{
-       if (!s)
-               return 0;
-
-       unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
-
-       for (unsigned i = 0; i < nr_data; i++)
-               blocks_empty += !stripe_blockcount_get(s, i);
-
-       /* Will be picked up by the stripe_delete worker */
-       if (blocks_empty == nr_data)
-               return STRIPE_LRU_POS_EMPTY;
-
-       if (!blocks_empty)
-               return 0;
-
-       /* invert: more blocks empty = reuse first */
-       return LRU_TIME_MAX - blocks_empty;
-}
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
-                                            const struct bch_extent_ptr *data_ptr,
-                                            unsigned sectors)
-{
-       return  (data_ptr->dev    == stripe_ptr->dev ||
-                data_ptr->dev    == BCH_SB_MEMBER_INVALID ||
-                stripe_ptr->dev  == BCH_SB_MEMBER_INVALID) &&
-               data_ptr->gen    == stripe_ptr->gen &&
-               data_ptr->offset >= stripe_ptr->offset &&
-               data_ptr->offset  < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
-                                          struct extent_ptr_decoded p)
-{
-       unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
-       BUG_ON(!p.has_ec);
-
-       if (p.ec.block >= nr_data)
-               return false;
-
-       return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
-                                        le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
-                                            struct extent_ptr_decoded p)
-{
-       unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
-       BUG_ON(!p.has_ec);
-
-       if (p.ec.block >= nr_data)
-               return false;
-
-       return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
-                                        m->sectors);
-}
-
-static inline void gc_stripe_unlock(struct gc_stripe *s)
-{
-       BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
-       clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
-       smp_mb__after_atomic();
-       wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void gc_stripe_lock(struct gc_stripe *s)
-{
-       wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
-                        TASK_UNINTERRUPTIBLE);
-}
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
-       /* might not be buffering the entire stripe: */
-       unsigned                offset;
-       unsigned                size;
-       unsigned long           valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
-       void                    *data[BCH_BKEY_PTRS_MAX];
-
-       __BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
-       STRIPE_REF_io,
-       STRIPE_REF_stripe,
-       STRIPE_REF_NR
-};
-
-struct ec_stripe_new {
-       struct bch_fs           *c;
-       struct ec_stripe_head   *h;
-       struct mutex            lock;
-       struct list_head        list;
-
-       struct hlist_node       hash;
-       u64                     idx;
-
-       struct closure          iodone;
-
-       atomic_t                ref[STRIPE_REF_NR];
-
-       int                     err;
-
-       u8                      nr_data;
-       u8                      nr_parity;
-       bool                    allocated;
-       bool                    pending;
-       bool                    have_existing_stripe;
-
-       unsigned long           blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-       unsigned long           blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-       open_bucket_idx_t       blocks[BCH_BKEY_PTRS_MAX];
-       struct disk_reservation res;
-
-       struct ec_stripe_buf    new_stripe;
-       struct ec_stripe_buf    existing_stripe;
-};
-
-struct ec_stripe_head {
-       struct list_head        list;
-       struct mutex            lock;
-
-       unsigned                disk_label;
-       unsigned                algo;
-       unsigned                redundancy;
-       enum bch_watermark      watermark;
-       bool                    insufficient_devs;
-
-       unsigned long           rw_devs_change_count;
-
-       u64                     nr_created;
-
-       struct bch_devs_mask    devs;
-       unsigned                nr_active_devs;
-
-       unsigned                blocksize;
-
-       struct dev_stripe_state block_stripe;
-       struct dev_stripe_state parity_stripe;
-
-       struct ec_stripe_new    *s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-
-struct alloc_request;
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
-                       struct alloc_request *, unsigned, struct closure *);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
-                                    enum ec_stripe_ref ref)
-{
-       atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
-                                    enum ec_stripe_ref ref)
-{
-       BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
-       if (atomic_dec_and_test(&s->ref[ref]))
-               switch (ref) {
-               case STRIPE_REF_stripe:
-                       bch2_ec_stripe_new_free(c, s);
-                       break;
-               case STRIPE_REF_io:
-                       bch2_ec_do_stripe_creates(c);
-                       break;
-               default:
-                       BUG();
-               }
-}
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
-                                 struct bkey_s_c, unsigned, unsigned);
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
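
stripe_csum_offset() and stripe_blockcount_offset() above index into the variable-length tail of a stripe key: the extent pointers come first, then a per-device checksum matrix, then one u16 sector count per block. A runnable userspace sketch of the same arithmetic, with stand-in sizes (PTR_BYTES, CSUM_BYTES and the struct are illustrative, not the on-disk format):

#include <stdint.h>
#include <stdio.h>

struct mini_stripe {
	uint16_t sectors;
	uint8_t  nr_blocks;
	uint8_t  csum_granularity_bits;
	/* followed by: ptrs[nr_blocks], checksum matrix, block counts */
};

#define PTR_BYTES	8	/* stand-in for sizeof(struct bch_extent_ptr) */
#define CSUM_BYTES	4	/* stand-in for bch_crc_bytes[csum_type] */

static unsigned csums_per_device(const struct mini_stripe *s)
{
	unsigned csum_blocksize = 1U << s->csum_granularity_bits;
	/* DIV_ROUND_UP(sectors, checksum granularity) */
	return (s->sectors + csum_blocksize - 1) / csum_blocksize;
}

static unsigned csum_offset(const struct mini_stripe *s,
			    unsigned dev, unsigned idx)
{
	return sizeof(*s) + PTR_BYTES * s->nr_blocks +
		(dev * csums_per_device(s) + idx) * CSUM_BYTES;
}

static unsigned blockcount_offset(const struct mini_stripe *s, unsigned idx)
{
	/* block counts start where device nr_blocks's checksums would:
	 * one u16 per block */
	return csum_offset(s, s->nr_blocks, 0) + sizeof(uint16_t) * idx;
}

int main(void)
{
	struct mini_stripe s = { .sectors = 512, .nr_blocks = 6,
				 .csum_granularity_bits = 6 };
	printf("csum[2][0] at %u, blockcount[3] at %u\n",
	       csum_offset(&s, 2, 0), blockcount_offset(&s, 3));
	return 0;
}
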
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index b9770f2..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
-       struct bch_val          v;
-       __le16                  sectors;
-       __u8                    algorithm;
-       __u8                    nr_blocks;
-       __u8                    nr_redundant;
-
-       __u8                    csum_granularity_bits;
-       __u8                    csum_type;
-
-       /*
-        * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
-        *
-        * we can manage with this because this only needs to point to a
-        * disk label, not a target:
-        */
-       __u8                    disk_label;
-
-       /*
-        * Variable length sections:
-        * - Pointers
-        * - Checksums
-        *   2D array of [stripe block/device][csum block], with checksum block
-        *   size given by csum_granularity_bits
-        * - Block sector counts: per-block array of u16s
-        *
-        * XXX:
-        * Either checksums should have come last, or we should have included a
-        * checksum_size field (the size in bytes of the checksum itself, not
-        * the blocksize the checksum covers).
-        *
-        * Currently we aren't able to access the block sector counts if the
-        * checksum type is unknown.
-        */
-
-       struct bch_extent_ptr   ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
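
struct bch_stripe above is a packed on-disk structure ending in a flexible array member, with multi-byte fields stored little-endian. A userspace sketch of decoding such a header (the kernel uses __le16 plus le16_to_cpu(); disk_hdr here is illustrative only):

#include <stdint.h>
#include <stdio.h>

struct disk_hdr {
	uint8_t sectors_le[2];	/* little-endian u16 on disk */
	uint8_t nr_blocks;
	uint8_t payload[];	/* variable-length tail, like ptrs[] above */
} __attribute__((packed));	/* the real struct is __packed __aligned(8) */

static uint16_t get_le16(const uint8_t b[2])
{
	return (uint16_t)(b[0] | (b[1] << 8));
}

int main(void)
{
	uint8_t raw[] = { 0x00, 0x02, 3, /* payload: */ 1, 2, 3 };
	const struct disk_hdr *h = (const void *)raw;

	/* a flexible array member adds nothing to sizeof() */
	printf("header %zu bytes, sectors %u, nr_blocks %u\n",
	       sizeof(*h), get_le16(h->sectors_le), h->nr_blocks);
	return 0;
}
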
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 809446c..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-union bch_replicas_padded {
-       u8                              bytes[struct_size_t(struct bch_replicas_entry_v1,
-                                                           devs, BCH_BKEY_PTRS_MAX)];
-       struct bch_replicas_entry_v1    e;
-};
-
-struct stripe {
-       size_t                  heap_idx;
-       u16                     sectors;
-       u8                      algorithm;
-       u8                      nr_blocks;
-       u8                      nr_redundant;
-       u8                      blocks_nonempty;
-       u8                      disk_label;
-};
-
-struct gc_stripe {
-       u8                      lock;
-       unsigned                alive:1; /* does a corresponding key exist in stripes btree? */
-       u16                     sectors;
-       u8                      nr_blocks;
-       u8                      nr_redundant;
-       u16                     block_sectors[BCH_BKEY_PTRS_MAX];
-       struct bch_extent_ptr   ptrs[BCH_BKEY_PTRS_MAX];
-
-       union bch_replicas_padded r;
-};
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
deleted file mode 100644
index 56ab430..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "enumerated_ref.h"
-#include "util.h"
-
-#include <linux/completion.h>
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
-       BUG_ON(idx >= ref->nr);
-       atomic_long_inc(&ref->refs[idx]);
-}
-
-bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
-       BUG_ON(idx >= ref->nr);
-       return atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
-       BUG_ON(idx >= ref->nr);
-       return !ref->dying &&
-               atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
-       BUG_ON(idx >= ref->nr);
-       long v = atomic_long_dec_return(&ref->refs[idx]);
-
-       BUG_ON(v < 0);
-       if (v)
-               return;
-
-       for (unsigned i = 0; i < ref->nr; i++)
-               if (atomic_long_read(&ref->refs[i]))
-                       return;
-
-       if (ref->stop_fn)
-               ref->stop_fn(ref);
-       complete(&ref->stop_complete);
-}
-#endif
-
-#ifndef ENUMERATED_REF_DEBUG
-static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
-{
-       struct enumerated_ref *ref =
-               container_of(percpu_ref, struct enumerated_ref, ref);
-
-       if (ref->stop_fn)
-               ref->stop_fn(ref);
-       complete(&ref->stop_complete);
-}
-#endif
-
-void enumerated_ref_stop_async(struct enumerated_ref *ref)
-{
-       reinit_completion(&ref->stop_complete);
-
-#ifndef ENUMERATED_REF_DEBUG
-       percpu_ref_kill(&ref->ref);
-#else
-       ref->dying = true;
-       for (unsigned i = 0; i < ref->nr; i++)
-               enumerated_ref_put(ref, i);
-#endif
-}
-
-void enumerated_ref_stop(struct enumerated_ref *ref,
-                        const char * const names[])
-{
-       enumerated_ref_stop_async(ref);
-       while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
-               prt_str(&buf, "Outstanding refs:\n");
-               enumerated_ref_to_text(&buf, ref, names);
-               printk(KERN_ERR "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-}
-
-void enumerated_ref_start(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
-       percpu_ref_reinit(&ref->ref);
-#else
-       ref->dying = false;
-       for (unsigned i = 0; i < ref->nr; i++) {
-               BUG_ON(atomic_long_read(&ref->refs[i]));
-               atomic_long_inc(&ref->refs[i]);
-       }
-#endif
-}
-
-void enumerated_ref_exit(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
-       percpu_ref_exit(&ref->ref);
-#else
-       kfree(ref->refs);
-       ref->refs = NULL;
-       ref->nr = 0;
-#endif
-}
-
-int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
-                       void (*stop_fn)(struct enumerated_ref *))
-{
-       init_completion(&ref->stop_complete);
-       ref->stop_fn = stop_fn;
-
-#ifndef ENUMERATED_REF_DEBUG
-       return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
-                           PERCPU_REF_INIT_DEAD, GFP_KERNEL);
-#else
-       ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
-       if (!ref->refs)
-               return -ENOMEM;
-
-       ref->nr = nr;
-       return 0;
-#endif
-}
-
-void enumerated_ref_to_text(struct printbuf *out,
-                           struct enumerated_ref *ref,
-                           const char * const names[])
-{
-#ifdef ENUMERATED_REF_DEBUG
-       bch2_printbuf_tabstop_push(out, 32);
-
-       for (unsigned i = 0; i < ref->nr; i++)
-               prt_printf(out, "%s\t%li\n", names[i],
-                          atomic_long_read(&ref->refs[i]));
-#else
-       prt_str(out, "(not in debug mode)\n");
-#endif
-}
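
Taken together with the declarations in enumerated_ref.h below, the lifecycle is: init (dead), start, get/put per enumerated user, stop, exit. A kernel-style usage sketch, not standalone code; the enum, names[] and callback are hypothetical:

enum my_ref_users {
	MY_REF_writes,
	MY_REF_gc,
	MY_REF_NR,
};

static const char * const my_ref_names[] = {
	"writes",
	"gc",
};

static void my_stop_fn(struct enumerated_ref *ref)
{
	/* runs once every outstanding ref has been dropped */
}

static int example(struct enumerated_ref *ref)
{
	int ret = enumerated_ref_init(ref, MY_REF_NR, my_stop_fn);
	if (ret)
		return ret;

	enumerated_ref_start(ref);		/* refs are initialized dead */

	if (enumerated_ref_tryget(ref, MY_REF_writes)) {
		/* ... work that must finish before shutdown ... */
		enumerated_ref_put(ref, MY_REF_writes);
	}

	/* blocks; in debug builds, dumps outstanding users every 10s */
	enumerated_ref_stop(ref, my_ref_names);
	enumerated_ref_exit(ref);
	return 0;
}
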
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
deleted file mode 100644
index ec01cf5..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_H
-#define _BCACHEFS_ENUMERATED_REF_H
-
-#include "enumerated_ref_types.h"
-
-/*
- * A refcount where the users are enumerated: in debug mode, we create separate
- * refcounts for each user, to make leaks and refcount errors easy to track
- * down:
- */
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *, unsigned);
-bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-void enumerated_ref_put(struct enumerated_ref *, unsigned);
-#else
-
-static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
-       percpu_ref_get(&ref->ref);
-}
-
-static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
-       return percpu_ref_tryget(&ref->ref);
-}
-
-static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
-       return percpu_ref_tryget_live(&ref->ref);
-}
-
-static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
-       percpu_ref_put(&ref->ref);
-}
-#endif
-
-static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
-       return percpu_ref_is_zero(&ref->ref);
-#else
-       for (unsigned i = 0; i < ref->nr; i++)
-               if (atomic_long_read(&ref->refs[i]))
-                       return false;
-       return true;
-#endif
-}
-
-void enumerated_ref_stop_async(struct enumerated_ref *);
-void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
-void enumerated_ref_start(struct enumerated_ref *);
-
-void enumerated_ref_exit(struct enumerated_ref *);
-int enumerated_ref_init(struct enumerated_ref *, unsigned,
-                       void (*stop_fn)(struct enumerated_ref *));
-
-struct printbuf;
-void enumerated_ref_to_text(struct printbuf *,
-                           struct enumerated_ref *,
-                           const char * const[]);
-
-#endif /* _BCACHEFS_ENUMERATED_REF_H */
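
In non-debug builds the wrappers above compile down to a bare percpu_ref, so the same lifecycle can be written directly against that API; a sketch for comparison (my_release() is hypothetical, the percpu_ref calls are the stock kernel interface):

#include <linux/percpu-refcount.h>

static void my_release(struct percpu_ref *ref)
{
	/* no references remain; safe to tear down */
}

static int percpu_example(struct percpu_ref *ref)
{
	/* start out dead, as enumerated_ref_init() does */
	int ret = percpu_ref_init(ref, my_release,
				  PERCPU_REF_INIT_DEAD, GFP_KERNEL);
	if (ret)
		return ret;

	percpu_ref_reinit(ref);			/* enumerated_ref_start() */

	if (percpu_ref_tryget_live(ref)) {	/* enumerated_ref_tryget() */
		/* ... */
		percpu_ref_put(ref);
	}

	percpu_ref_kill(ref);			/* enumerated_ref_stop_async() */
	/* once my_release() has run: */
	percpu_ref_exit(ref);
	return 0;
}
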
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
deleted file mode 100644
index 0e6076f..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
-#define _BCACHEFS_ENUMERATED_REF_TYPES_H
-
-#include <linux/percpu-refcount.h>
-
-struct enumerated_ref {
-#ifdef ENUMERATED_REF_DEBUG
-       unsigned                nr;
-       bool                    dying;
-       atomic_long_t           *refs;
-#else
-       struct percpu_ref       ref;
-#endif
-       void                    (*stop_fn)(struct enumerated_ref *);
-       struct completion       stop_complete;
-};
-
-#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index c39cf30..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
-       BCH_ERRCODES()
-#undef x
-       NULL
-};
-
-static const unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
-       BCH_ERRCODES()
-#undef x
-};
-
-__attribute__((const))
-const char *bch2_err_str(int err)
-{
-       const char *errstr;
-
-       err = abs(err);
-
-       BUG_ON(err >= BCH_ERR_MAX);
-
-       if (err >= BCH_ERR_START)
-               errstr = bch2_errcode_strs[err - BCH_ERR_START];
-       else if (err)
-               errstr = errname(err);
-       else
-               errstr = "(No error)";
-       return errstr ?: "(Invalid error)";
-}
-
-__attribute__((const))
-bool __bch2_err_matches(int err, int class)
-{
-       err     = abs(err);
-       class   = abs(class);
-
-       BUG_ON(err      >= BCH_ERR_MAX);
-       BUG_ON(class    >= BCH_ERR_MAX);
-
-       while (err >= BCH_ERR_START && err != class)
-               err = bch2_errcode_parents[err - BCH_ERR_START];
-
-       return err == class;
-}
-
-int __bch2_err_class(int bch_err)
-{
-       int std_err = -bch_err;
-       BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
-       while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
-               std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
-       trace_error_downcast(bch_err, std_err, _RET_IP_);
-
-       return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
-       if (status == BLK_STS_REMOVED)
-               return "device removed";
-       return blk_status_to_str(status);
-}
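
bch2_errcode_strs[] and bch2_errcode_parents[] are both generated from the single BCH_ERRCODES() x-macro in errcode.h below, and __bch2_err_matches() resolves subclasses by walking the parent table. A self-contained sketch of the pattern with toy error codes:

#include <stdio.h>

#define ERR_START 1000

#define ERRCODES()					\
	x(0,			io_error)		\
	x(ERR_io_error,		read_error)		\
	x(ERR_read_error,	checksum_error)

enum {
	ERR_dummy = ERR_START - 1,
#define x(class, err)	ERR_##err,
	ERRCODES()
#undef x
	ERR_MAX
};

static const char * const err_strs[] = {
#define x(class, err)	[ERR_##err - ERR_START] = #err,
	ERRCODES()
#undef x
};

static const unsigned err_parents[] = {
#define x(class, err)	[ERR_##err - ERR_START] = class,
	ERRCODES()
#undef x
};

static int err_matches(int err, int class)
{
	/* walk up the hierarchy until we hit the class or leave the range */
	while (err >= ERR_START && err != class)
		err = err_parents[err - ERR_START];
	return err == class;
}

int main(void)
{
	printf("%s matches io_error: %d\n",
	       err_strs[ERR_checksum_error - ERR_START],
	       err_matches(ERR_checksum_error, ERR_io_error));
	return 0;
}
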
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index acc3b7b..0000000
+++ /dev/null
@@ -1,387 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES()                                                         \
-       x(ERANGE,                       ERANGE_option_too_small)                \
-       x(ERANGE,                       ERANGE_option_too_big)                  \
-       x(EINVAL,                       injected)                               \
-       x(BCH_ERR_injected,             injected_fs_start)                      \
-       x(EINVAL,                       mount_option)                           \
-       x(BCH_ERR_mount_option,         option_name)                            \
-       x(BCH_ERR_mount_option,         option_value)                           \
-       x(BCH_ERR_mount_option,         option_not_bool)                        \
-       x(ENOMEM,                       ENOMEM_stripe_buf)                      \
-       x(ENOMEM,                       ENOMEM_replicas_table)                  \
-       x(ENOMEM,                       ENOMEM_cpu_replicas)                    \
-       x(ENOMEM,                       ENOMEM_replicas_gc)                     \
-       x(ENOMEM,                       ENOMEM_disk_groups_validate)            \
-       x(ENOMEM,                       ENOMEM_disk_groups_to_cpu)              \
-       x(ENOMEM,                       ENOMEM_mark_snapshot)                   \
-       x(ENOMEM,                       ENOMEM_mark_stripe)                     \
-       x(ENOMEM,                       ENOMEM_mark_stripe_ptr)                 \
-       x(ENOMEM,                       ENOMEM_btree_key_cache_create)          \
-       x(ENOMEM,                       ENOMEM_btree_key_cache_fill)            \
-       x(ENOMEM,                       ENOMEM_btree_key_cache_insert)          \
-       x(ENOMEM,                       ENOMEM_trans_kmalloc)                   \
-       x(ENOMEM,                       ENOMEM_trans_log_msg)                   \
-       x(ENOMEM,                       ENOMEM_do_encrypt)                      \
-       x(ENOMEM,                       ENOMEM_ec_read_extent)                  \
-       x(ENOMEM,                       ENOMEM_ec_stripe_mem_alloc)             \
-       x(ENOMEM,                       ENOMEM_ec_new_stripe_alloc)             \
-       x(ENOMEM,                       ENOMEM_fs_btree_cache_init)             \
-       x(ENOMEM,                       ENOMEM_fs_btree_key_cache_init)         \
-       x(ENOMEM,                       ENOMEM_fs_counters_init)                \
-       x(ENOMEM,                       ENOMEM_fs_btree_write_buffer_init)      \
-       x(ENOMEM,                       ENOMEM_io_clock_init)                   \
-       x(ENOMEM,                       ENOMEM_blacklist_table_init)            \
-       x(ENOMEM,                       ENOMEM_sb_realloc_injected)             \
-       x(ENOMEM,                       ENOMEM_sb_bio_realloc)                  \
-       x(ENOMEM,                       ENOMEM_sb_buf_realloc)                  \
-       x(ENOMEM,                       ENOMEM_sb_journal_validate)             \
-       x(ENOMEM,                       ENOMEM_sb_journal_v2_validate)          \
-       x(ENOMEM,                       ENOMEM_journal_entry_add)               \
-       x(ENOMEM,                       ENOMEM_journal_read_buf_realloc)        \
-       x(ENOMEM,                       ENOMEM_btree_interior_update_worker_init)\
-       x(ENOMEM,                       ENOMEM_btree_interior_update_pool_init) \
-       x(ENOMEM,                       ENOMEM_bio_read_init)                   \
-       x(ENOMEM,                       ENOMEM_bio_read_split_init)             \
-       x(ENOMEM,                       ENOMEM_bio_write_init)                  \
-       x(ENOMEM,                       ENOMEM_bio_bounce_pages_init)           \
-       x(ENOMEM,                       ENOMEM_writepage_bioset_init)           \
-       x(ENOMEM,                       ENOMEM_dio_read_bioset_init)            \
-       x(ENOMEM,                       ENOMEM_dio_write_bioset_init)           \
-       x(ENOMEM,                       ENOMEM_nocow_flush_bioset_init)         \
-       x(ENOMEM,                       ENOMEM_promote_table_init)              \
-       x(ENOMEM,                       ENOMEM_async_obj_init)                  \
-       x(ENOMEM,                       ENOMEM_compression_bounce_read_init)    \
-       x(ENOMEM,                       ENOMEM_compression_bounce_write_init)   \
-       x(ENOMEM,                       ENOMEM_compression_workspace_init)      \
-       x(ENOMEM,                       ENOMEM_backpointer_mismatches_bitmap)   \
-       x(EIO,                          compression_workspace_not_initialized)  \
-       x(ENOMEM,                       ENOMEM_bucket_gens)                     \
-       x(ENOMEM,                       ENOMEM_buckets_nouse)                   \
-       x(ENOMEM,                       ENOMEM_usage_init)                      \
-       x(ENOMEM,                       ENOMEM_btree_node_read_all_replicas)    \
-       x(ENOMEM,                       ENOMEM_btree_node_reclaim)              \
-       x(ENOMEM,                       ENOMEM_btree_node_mem_alloc)            \
-       x(ENOMEM,                       ENOMEM_btree_cache_cannibalize_lock)    \
-       x(ENOMEM,                       ENOMEM_buckets_waiting_for_journal_init)\
-       x(ENOMEM,                       ENOMEM_buckets_waiting_for_journal_set) \
-       x(ENOMEM,                       ENOMEM_set_nr_journal_buckets)          \
-       x(ENOMEM,                       ENOMEM_dev_journal_init)                \
-       x(ENOMEM,                       ENOMEM_journal_pin_fifo)                \
-       x(ENOMEM,                       ENOMEM_journal_buf)                     \
-       x(ENOMEM,                       ENOMEM_gc_start)                        \
-       x(ENOMEM,                       ENOMEM_gc_alloc_start)                  \
-       x(ENOMEM,                       ENOMEM_gc_reflink_start)                \
-       x(ENOMEM,                       ENOMEM_gc_gens)                         \
-       x(ENOMEM,                       ENOMEM_gc_repair_key)                   \
-       x(ENOMEM,                       ENOMEM_fsck_extent_ends_at)             \
-       x(ENOMEM,                       ENOMEM_fsck_add_nlink)                  \
-       x(ENOMEM,                       ENOMEM_journal_key_insert)              \
-       x(ENOMEM,                       ENOMEM_journal_keys_sort)               \
-       x(ENOMEM,                       ENOMEM_read_superblock_clean)           \
-       x(ENOMEM,                       ENOMEM_fs_alloc)                        \
-       x(ENOMEM,                       ENOMEM_fs_name_alloc)                   \
-       x(ENOMEM,                       ENOMEM_fs_other_alloc)                  \
-       x(ENOMEM,                       ENOMEM_dev_alloc)                       \
-       x(ENOMEM,                       ENOMEM_disk_accounting)                 \
-       x(ENOMEM,                       ENOMEM_stripe_head_alloc)               \
-       x(ENOMEM,                       ENOMEM_journal_read_bucket)             \
-       x(ENOSPC,                       ENOSPC_disk_reservation)                \
-       x(ENOSPC,                       ENOSPC_bucket_alloc)                    \
-       x(ENOSPC,                       ENOSPC_disk_label_add)                  \
-       x(ENOSPC,                       ENOSPC_stripe_create)                   \
-       x(ENOSPC,                       ENOSPC_inode_create)                    \
-       x(ENOSPC,                       ENOSPC_str_hash_create)                 \
-       x(ENOSPC,                       ENOSPC_snapshot_create)                 \
-       x(ENOSPC,                       ENOSPC_subvolume_create)                \
-       x(ENOSPC,                       ENOSPC_sb)                              \
-       x(ENOSPC,                       ENOSPC_sb_journal)                      \
-       x(ENOSPC,                       ENOSPC_sb_journal_seq_blacklist)        \
-       x(ENOSPC,                       ENOSPC_sb_quota)                        \
-       x(ENOSPC,                       ENOSPC_sb_replicas)                     \
-       x(ENOSPC,                       ENOSPC_sb_members)                      \
-       x(ENOSPC,                       ENOSPC_sb_members_v2)                   \
-       x(ENOSPC,                       ENOSPC_sb_crypt)                        \
-       x(ENOSPC,                       ENOSPC_sb_downgrade)                    \
-       x(ENOSPC,                       ENOSPC_btree_slot)                      \
-       x(ENOSPC,                       ENOSPC_snapshot_tree)                   \
-       x(ENOENT,                       ENOENT_bkey_type_mismatch)              \
-       x(ENOENT,                       ENOENT_str_hash_lookup)                 \
-       x(ENOENT,                       ENOENT_str_hash_set_must_replace)       \
-       x(ENOENT,                       ENOENT_inode)                           \
-       x(ENOENT,                       ENOENT_not_subvol)                      \
-       x(ENOENT,                       ENOENT_not_directory)                   \
-       x(ENOENT,                       ENOENT_directory_dead)                  \
-       x(ENOENT,                       ENOENT_subvolume)                       \
-       x(ENOENT,                       ENOENT_snapshot_tree)                   \
-       x(ENOENT,                       ENOENT_dirent_doesnt_match_inode)       \
-       x(ENOENT,                       ENOENT_dev_not_found)                   \
-       x(ENOENT,                       ENOENT_dev_bucket_not_found)            \
-       x(ENOENT,                       ENOENT_dev_idx_not_found)               \
-       x(ENOENT,                       ENOENT_inode_no_backpointer)            \
-       x(ENOENT,                       ENOENT_no_snapshot_tree_subvol)         \
-       x(ENOENT,                       btree_node_dying)                       \
-       x(ENOTEMPTY,                    ENOTEMPTY_dir_not_empty)                \
-       x(ENOTEMPTY,                    ENOTEMPTY_subvol_not_empty)             \
-       x(EEXIST,                       EEXIST_str_hash_set)                    \
-       x(EEXIST,                       EEXIST_discard_in_flight_add)           \
-       x(EEXIST,                       EEXIST_subvolume_create)                \
-       x(ENOSPC,                       open_buckets_empty)                     \
-       x(ENOSPC,                       freelist_empty)                         \
-       x(BCH_ERR_freelist_empty,       no_buckets_found)                       \
-       x(0,                            transaction_restart)                    \
-       x(BCH_ERR_transaction_restart,  transaction_restart_fault_inject)       \
-       x(BCH_ERR_transaction_restart,  transaction_restart_relock)             \
-       x(BCH_ERR_transaction_restart,  transaction_restart_relock_path)        \
-       x(BCH_ERR_transaction_restart,  transaction_restart_relock_path_intent) \
-       x(BCH_ERR_transaction_restart,  transaction_restart_too_many_iters)     \
-       x(BCH_ERR_transaction_restart,  transaction_restart_lock_node_reused)   \
-       x(BCH_ERR_transaction_restart,  transaction_restart_fill_relock)        \
-       x(BCH_ERR_transaction_restart,  transaction_restart_fill_mem_alloc_fail)\
-       x(BCH_ERR_transaction_restart,  transaction_restart_mem_realloced)      \
-       x(BCH_ERR_transaction_restart,  transaction_restart_in_traverse_all)    \
-       x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock)     \
-       x(BCH_ERR_transaction_restart,  transaction_restart_would_deadlock_write)\
-       x(BCH_ERR_transaction_restart,  transaction_restart_deadlock_recursion_limit)\
-       x(BCH_ERR_transaction_restart,  transaction_restart_upgrade)            \
-       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_fill)     \
-       x(BCH_ERR_transaction_restart,  transaction_restart_key_cache_raced)    \
-       x(BCH_ERR_transaction_restart,  transaction_restart_split_race)         \
-       x(BCH_ERR_transaction_restart,  transaction_restart_write_buffer_flush) \
-       x(BCH_ERR_transaction_restart,  transaction_restart_nested)             \
-       x(BCH_ERR_transaction_restart,  transaction_restart_commit)             \
-       x(0,                            no_btree_node)                          \
-       x(BCH_ERR_no_btree_node,        no_btree_node_relock)                   \
-       x(BCH_ERR_no_btree_node,        no_btree_node_upgrade)                  \
-       x(BCH_ERR_no_btree_node,        no_btree_node_drop)                     \
-       x(BCH_ERR_no_btree_node,        no_btree_node_lock_root)                \
-       x(BCH_ERR_no_btree_node,        no_btree_node_up)                       \
-       x(BCH_ERR_no_btree_node,        no_btree_node_down)                     \
-       x(BCH_ERR_no_btree_node,        no_btree_node_init)                     \
-       x(BCH_ERR_no_btree_node,        no_btree_node_cached)                   \
-       x(BCH_ERR_no_btree_node,        no_btree_node_srcu_reset)               \
-       x(0,                            btree_insert_fail)                      \
-       x(BCH_ERR_btree_insert_fail,    btree_insert_btree_node_full)           \
-       x(BCH_ERR_btree_insert_fail,    btree_insert_need_mark_replicas)        \
-       x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_res)          \
-       x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_reclaim)      \
-       x(0,                            backpointer_to_overwritten_btree_node)  \
-       x(0,                            journal_reclaim_would_deadlock)         \
-       x(EINVAL,                       fsck)                                   \
-       x(BCH_ERR_fsck,                 fsck_ask)                               \
-       x(BCH_ERR_fsck,                 fsck_fix)                               \
-       x(BCH_ERR_fsck,                 fsck_delete_bkey)                       \
-       x(BCH_ERR_fsck,                 fsck_ignore)                            \
-       x(BCH_ERR_fsck,                 fsck_errors_not_fixed)                  \
-       x(BCH_ERR_fsck,                 fsck_repair_unimplemented)              \
-       x(BCH_ERR_fsck,                 fsck_repair_impossible)                 \
-       x(EINVAL,                       recovery_will_run)                      \
-       x(BCH_ERR_recovery_will_run,    restart_recovery)                       \
-       x(BCH_ERR_recovery_will_run,    cannot_rewind_recovery)                 \
-       x(BCH_ERR_recovery_will_run,    recovery_pass_will_run)                 \
-       x(0,                            data_update_done)                       \
-       x(0,                            bkey_was_deleted)                       \
-       x(BCH_ERR_data_update_done,     data_update_done_would_block)           \
-       x(BCH_ERR_data_update_done,     data_update_done_unwritten)             \
-       x(BCH_ERR_data_update_done,     data_update_done_no_writes_needed)      \
-       x(BCH_ERR_data_update_done,     data_update_done_no_snapshot)           \
-       x(BCH_ERR_data_update_done,     data_update_done_no_dev_refs)           \
-       x(BCH_ERR_data_update_done,     data_update_done_no_rw_devs)            \
-       x(EINVAL,                       device_state_not_allowed)               \
-       x(EINVAL,                       member_info_missing)                    \
-       x(EINVAL,                       mismatched_block_size)                  \
-       x(EINVAL,                       block_size_too_small)                   \
-       x(EINVAL,                       bucket_size_too_small)                  \
-       x(EINVAL,                       device_size_too_small)                  \
-       x(EINVAL,                       device_size_too_big)                    \
-       x(EINVAL,                       device_not_a_member_of_filesystem)      \
-       x(EINVAL,                       device_has_been_removed)                \
-       x(EINVAL,                       device_splitbrain)                      \
-       x(EINVAL,                       device_already_online)                  \
-       x(EINVAL,                       filesystem_uuid_already_open)           \
-       x(EINVAL,                       insufficient_devices_to_start)          \
-       x(EINVAL,                       invalid)                                \
-       x(EINVAL,                       internal_fsck_err)                      \
-       x(EINVAL,                       opt_parse_error)                        \
-       x(EINVAL,                       remove_with_metadata_missing_unimplemented)\
-       x(EINVAL,                       remove_would_lose_data)                 \
-       x(EINVAL,                       no_resize_with_buckets_nouse)           \
-       x(EINVAL,                       inode_unpack_error)                     \
-       x(EINVAL,                       inode_not_unlinked)                     \
-       x(EINVAL,                       inode_has_child_snapshot)               \
-       x(EINVAL,                       varint_decode_error)                    \
-       x(EINVAL,                       erasure_coding_found_btree_node)        \
-       x(EINVAL,                       option_negative)                        \
-       x(EOPNOTSUPP,                   may_not_use_incompat_feature)           \
-       x(EROFS,                        erofs_trans_commit)                     \
-       x(EROFS,                        erofs_no_writes)                        \
-       x(EROFS,                        erofs_journal_err)                      \
-       x(EROFS,                        erofs_sb_err)                           \
-       x(EROFS,                        erofs_unfixed_errors)                   \
-       x(EROFS,                        erofs_norecovery)                       \
-       x(EROFS,                        erofs_nochanges)                        \
-       x(EROFS,                        erofs_no_alloc_info)                    \
-       x(EROFS,                        erofs_filesystem_full)                  \
-       x(EROFS,                        insufficient_devices)                   \
-       x(0,                            operation_blocked)                      \
-       x(BCH_ERR_operation_blocked,    btree_cache_cannibalize_lock_blocked)   \
-       x(BCH_ERR_operation_blocked,    journal_res_blocked)                    \
-       x(BCH_ERR_journal_res_blocked,  journal_blocked)                        \
-       x(BCH_ERR_journal_res_blocked,  journal_max_in_flight)                  \
-       x(BCH_ERR_journal_res_blocked,  journal_max_open)                       \
-       x(BCH_ERR_journal_res_blocked,  journal_full)                           \
-       x(BCH_ERR_journal_res_blocked,  journal_pin_full)                       \
-       x(BCH_ERR_journal_res_blocked,  journal_buf_enomem)                     \
-       x(BCH_ERR_journal_res_blocked,  journal_stuck)                          \
-       x(BCH_ERR_journal_res_blocked,  journal_retry_open)                     \
-       x(BCH_ERR_journal_res_blocked,  bucket_alloc_blocked)                   \
-       x(BCH_ERR_journal_res_blocked,  stripe_alloc_blocked)                   \
-       x(BCH_ERR_invalid,              invalid_sb)                             \
-       x(BCH_ERR_invalid_sb,           invalid_sb_magic)                       \
-       x(BCH_ERR_invalid_sb,           invalid_sb_version)                     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_features)                    \
-       x(BCH_ERR_invalid_sb,           invalid_sb_too_big)                     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_csum_type)                   \
-       x(BCH_ERR_invalid_sb,           invalid_sb_csum)                        \
-       x(BCH_ERR_invalid_sb,           invalid_sb_block_size)                  \
-       x(BCH_ERR_invalid_sb,           invalid_sb_uuid)                        \
-       x(BCH_ERR_invalid_sb,           invalid_sb_offset)                      \
-       x(BCH_ERR_invalid_sb,           invalid_sb_too_many_members)            \
-       x(BCH_ERR_invalid_sb,           invalid_sb_dev_idx)                     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_time_precision)              \
-       x(BCH_ERR_invalid_sb,           invalid_sb_field_size)                  \
-       x(BCH_ERR_invalid_sb,           invalid_sb_layout)                      \
-       x(BCH_ERR_invalid_sb_layout,    invalid_sb_layout_type)                 \
-       x(BCH_ERR_invalid_sb_layout,    invalid_sb_layout_nr_superblocks)       \
-       x(BCH_ERR_invalid_sb_layout,    invalid_sb_layout_superblocks_overlap)  \
-       x(BCH_ERR_invalid_sb_layout,    invalid_sb_layout_sb_max_size_bits)     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_members_missing)             \
-       x(BCH_ERR_invalid_sb,           invalid_sb_members)                     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_disk_groups)                 \
-       x(BCH_ERR_invalid_sb,           invalid_sb_replicas)                    \
-       x(BCH_ERR_invalid_sb,           invalid_replicas_entry)                 \
-       x(BCH_ERR_invalid_sb,           invalid_sb_journal)                     \
-       x(BCH_ERR_invalid_sb,           invalid_sb_journal_seq_blacklist)       \
-       x(BCH_ERR_invalid_sb,           invalid_sb_crypt)                       \
-       x(BCH_ERR_invalid_sb,           invalid_sb_clean)                       \
-       x(BCH_ERR_invalid_sb,           invalid_sb_quota)                       \
-       x(BCH_ERR_invalid_sb,           invalid_sb_errors)                      \
-       x(BCH_ERR_invalid_sb,           invalid_sb_opt_compression)             \
-       x(BCH_ERR_invalid_sb,           invalid_sb_ext)                         \
-       x(BCH_ERR_invalid_sb,           invalid_sb_downgrade)                   \
-       x(BCH_ERR_invalid,              invalid_bkey)                           \
-       x(BCH_ERR_operation_blocked,    nocow_lock_blocked)                     \
-       x(EROFS,                        journal_shutdown)                       \
-       x(EIO,                          journal_flush_err)                      \
-       x(EIO,                          journal_write_err)                      \
-       x(EIO,                          btree_node_read_err)                    \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_cached)             \
-       x(EIO,                          sb_not_downgraded)                      \
-       x(EIO,                          btree_node_write_all_failed)            \
-       x(EIO,                          btree_node_read_error)                  \
-       x(EIO,                          btree_need_topology_repair)             \
-       x(EIO,                          bucket_ref_update)                      \
-       x(EIO,                          trigger_alloc)                          \
-       x(EIO,                          trigger_pointer)                        \
-       x(EIO,                          trigger_stripe_pointer)                 \
-       x(EIO,                          metadata_bucket_inconsistency)          \
-       x(EIO,                          mark_stripe)                            \
-       x(EIO,                          stripe_reconstruct)                     \
-       x(EIO,                          key_type_error)                         \
-       x(EIO,                          extent_poisoned)                        \
-       x(EIO,                          missing_indirect_extent)                \
-       x(EIO,                          invalidate_stripe_to_dev)               \
-       x(EIO,                          no_encryption_key)                      \
-       x(EIO,                          insufficient_journal_devices)           \
-       x(EIO,                          device_offline)                         \
-       x(EIO,                          EIO_fault_injected)                     \
-       x(EIO,                          ec_block_read)                          \
-       x(EIO,                          ec_block_write)                         \
-       x(EIO,                          recompute_checksum)                     \
-       x(EIO,                          decompress)                             \
-       x(BCH_ERR_decompress,           decompress_exceeded_max_encoded_extent) \
-       x(BCH_ERR_decompress,           decompress_lz4)                         \
-       x(BCH_ERR_decompress,           decompress_gzip)                        \
-       x(BCH_ERR_decompress,           decompress_zstd_src_len_bad)            \
-       x(BCH_ERR_decompress,           decompress_zstd)                        \
-       x(EIO,                          data_write)                             \
-       x(BCH_ERR_data_write,           data_write_io)                          \
-       x(BCH_ERR_data_write,           data_write_csum)                        \
-       x(BCH_ERR_data_write,           data_write_invalid_ptr)                 \
-       x(BCH_ERR_data_write,           data_write_misaligned)                  \
-       x(BCH_ERR_decompress,           data_read)                              \
-       x(BCH_ERR_data_read,            no_device_to_read_from)                 \
-       x(BCH_ERR_data_read,            no_devices_valid)                       \
-       x(BCH_ERR_data_read,            data_read_io_err)                       \
-       x(BCH_ERR_data_read,            data_read_csum_err)                     \
-       x(BCH_ERR_data_read,            data_read_retry)                        \
-       x(BCH_ERR_data_read_retry,      data_read_retry_avoid)                  \
-       x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline)         \
-       x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err)                 \
-       x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err)     \
-       x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err)               \
-       x(BCH_ERR_data_read_retry,      data_read_retry_csum_err_maybe_userspace)\
-       x(BCH_ERR_data_read,            data_read_decompress_err)               \
-       x(BCH_ERR_data_read,            data_read_decrypt_err)                  \
-       x(BCH_ERR_data_read,            data_read_ptr_stale_race)               \
-       x(BCH_ERR_data_read_retry,      data_read_ptr_stale_retry)              \
-       x(BCH_ERR_data_read,            data_read_no_encryption_key)            \
-       x(BCH_ERR_data_read,            data_read_buffer_too_small)             \
-       x(BCH_ERR_data_read,            data_read_key_overwritten)              \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_fixable)            \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_want_retry)         \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_must_retry)         \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_bad_node)           \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_incompatible)       \
-       x(0,                            nopromote)                              \
-       x(BCH_ERR_nopromote,            nopromote_may_not)                      \
-       x(BCH_ERR_nopromote,            nopromote_already_promoted)             \
-       x(BCH_ERR_nopromote,            nopromote_unwritten)                    \
-       x(BCH_ERR_nopromote,            nopromote_congested)                    \
-       x(BCH_ERR_nopromote,            nopromote_in_flight)                    \
-       x(BCH_ERR_nopromote,            nopromote_no_writes)                    \
-       x(BCH_ERR_nopromote,            nopromote_enomem)                       \
-       x(0,                            invalid_snapshot_node)                  \
-       x(0,                            option_needs_open_fs)                   \
-       x(0,                            remove_disk_accounting_entry)
-
-enum bch_errcode {
-       BCH_ERR_START           = 2048,
-#define x(class, err) BCH_ERR_##err,
-       BCH_ERRCODES()
-#undef x
-       BCH_ERR_MAX
-};
-
-__attribute__((const)) const char *bch2_err_str(int);
-
-__attribute__((const)) bool __bch2_err_matches(int, int);
-
-__attribute__((const))
-static inline bool _bch2_err_matches(int err, int class)
-{
-       return err < 0 && __bch2_err_matches(err, class);
-}
-
-#define bch2_err_matches(_err, _class)                 \
-({                                                     \
-       BUILD_BUG_ON(!__builtin_constant_p(_class));    \
-       unlikely(_bch2_err_matches(_err, _class));      \
-})
-
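Each x() entry above names a parent class followed by the code itself, and error matching walks that chain upward until it reaches a plain errno. A minimal userspace sketch of the same x-macro pattern (the error names and tables here are invented stand-ins, not the real ones):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdlib.h>

    /* trimmed-down stand-in for BCH_ERRCODES(): parent class, then code */
    #define ERRCODES()                               \
            x(EIO,           data_read)              \
            x(ERR_data_read, data_read_io_err)       \
            x(ERR_data_read, data_read_csum_err)

    enum errcode {
            ERR_START = 2048,
    #define x(class, err) ERR_##err,
            ERRCODES()
    #undef x
            ERR_MAX
    };

    /* parent class per private code; plain errnos (< ERR_START) are roots */
    static const unsigned err_class[] = {
    #define x(class, err) [ERR_##err - ERR_START - 1] = class,
            ERRCODES()
    #undef x
    };

    static bool err_matches(int err, int class)
    {
            err = abs(err);
            while (err > ERR_START && err < ERR_MAX) {
                    if (err == class)
                            return true;
                    err = err_class[err - ERR_START - 1];
            }
            return err == class;
    }
    /* err_matches(-ERR_data_read_csum_err, ERR_data_read) and
     * err_matches(-ERR_data_read_csum_err, EIO) are both true:
     * the chain csum_err -> data_read -> EIO ends at a plain errno */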
-int __bch2_err_class(int);
-
-static inline long bch2_err_class(long err)
-{
-       return err < 0 ? __bch2_err_class(err) : err;
-}
-
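Private codes never escape to userspace; a sketch of what a syscall-boundary caller looks like (the helper name is invented):

    /* hypothetical VFS-facing wrapper: internal codes are collapsed to
     * their outermost errno class before crossing the syscall boundary */
    static int foo_fs_op(struct bch_fs *c)
    {
            int ret = foo_internal_op(c);   /* invented helper */

            return bch2_err_class(ret);     /* e.g. -BCH_ERR_data_read_io_err -> -EIO */
    }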
-#define BLK_STS_REMOVED                ((__force blk_status_t)128)
-
-#include <linux/blk_types.h>
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644 (file)
index 267e73d..0000000
+++ /dev/null
@@ -1,771 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "error.h"
-#include "journal.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR  10
-
-void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
-{
-       printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
-       prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
-#endif
-}
-
-bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
-{
-       set_bit(BCH_FS_error, &c->flags);
-
-       switch (c->opts.errors) {
-       case BCH_ON_ERROR_continue:
-               return false;
-       case BCH_ON_ERROR_fix_safe:
-       case BCH_ON_ERROR_ro:
-               bch2_fs_emergency_read_only2(c, out);
-               return true;
-       case BCH_ON_ERROR_panic:
-               bch2_print_str(c, KERN_ERR, out->buf);
-               panic(bch2_fmt(c, "panic after error"));
-               return true;
-       default:
-               BUG();
-       }
-}
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
-       struct printbuf buf = PRINTBUF;
-       buf.atomic++;
-
-       printbuf_indent_add_nextline(&buf, 2);
-
-       bool ret = __bch2_inconsistent_error(c, &buf);
-       if (ret)
-               bch_err(c, "%s", buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-__printf(3, 0)
-static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
-                                      const char *fmt, va_list args)
-{
-       struct printbuf buf = PRINTBUF;
-       buf.atomic++;
-
-       bch2_log_msg_start(c, &buf);
-
-       prt_vprintf(&buf, fmt, args);
-       prt_newline(&buf);
-
-       if (trans)
-               bch2_trans_updates_to_text(&buf, trans);
-       bool ret = __bch2_inconsistent_error(c, &buf);
-       bch2_print_str(c, KERN_ERR, buf.buf);
-
-       printbuf_exit(&buf);
-       return ret;
-}
-
-bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       va_start(args, fmt);
-       bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
-       va_end(args);
-       return ret;
-}
-
-bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
-{
-       va_list args;
-       va_start(args, fmt);
-       bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
-       va_end(args);
-       return ret;
-}
-
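Both variadic wrappers above funnel into one va_list worker; the __printf(3, 0) on the worker tells the compiler its third argument is a format string whose arguments were already checked at the __printf(2, 3) call sites. A self-contained sketch of the same pattern (names invented):

    #include <stdarg.h>
    #include <stdio.h>

    __attribute__((format(printf, 2, 0)))
    static void foo_vlog(FILE *out, const char *fmt, va_list args)
    {
            vfprintf(out, fmt, args);       /* single place that formats */
    }

    __attribute__((format(printf, 2, 3)))
    static void foo_log(FILE *out, const char *fmt, ...)
    {
            va_list args;

            va_start(args, fmt);
            foo_vlog(out, fmt, args);
            va_end(args);
    }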
-int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
-{
-       prt_printf(out, "btree topology error: ");
-
-       set_bit(BCH_FS_topology_error, &c->flags);
-       if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
-               __bch2_inconsistent_error(c, out);
-               return bch_err_throw(c, btree_need_topology_repair);
-       } else {
-               return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
-                       bch_err_throw(c, btree_need_topology_repair);
-       }
-}
-
-int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
-{
-       struct printbuf buf = PRINTBUF;
-
-       bch2_log_msg_start(c, &buf);
-
-       va_list args;
-       va_start(args, fmt);
-       prt_vprintf(&buf, fmt, args);
-       va_end(args);
-
-       int ret = __bch2_topology_error(c, &buf);
-       bch2_print_str(c, KERN_ERR, buf.buf);
-
-       printbuf_exit(&buf);
-       return ret;
-}
-
-void bch2_fatal_error(struct bch_fs *c)
-{
-       if (bch2_fs_emergency_read_only(c))
-               bch_err(c, "fatal error - emergency read only");
-}
-
-void bch2_io_error_work(struct work_struct *work)
-{
-       struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
-       struct bch_fs *c = ca->fs;
-
-       /* XXX: if it's reads or checksums that are failing, set it to failed */
-
-       down_write(&c->state_lock);
-       unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
-
-       if (write_errors_start &&
-           time_after(jiffies,
-                      write_errors_start + c->opts.write_error_timeout * HZ)) {
-               if (ca->mi.state >= BCH_MEMBER_STATE_ro)
-                       goto out;
-
-               bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-                                                BCH_FORCE_IF_DEGRADED);
-               struct printbuf buf = PRINTBUF;
-               __bch2_log_msg_start(ca->name, &buf);
-
-               prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
-                       c->opts.write_error_timeout,
-                       dev ? "device" : "filesystem");
-               if (!dev)
-                       bch2_fs_emergency_read_only2(c, &buf);
-
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-out:
-       up_write(&c->state_lock);
-}
-
-void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
-{
-       atomic64_inc(&ca->errors[type]);
-
-       if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
-               ca->write_errors_start = jiffies;
-
-       queue_work(system_long_wq, &ca->io_error_work);
-}
-
-enum ask_yn {
-       YN_NO,
-       YN_YES,
-       YN_ALLNO,
-       YN_ALLYES,
-};
-
-static enum ask_yn parse_yn_response(char *buf)
-{
-       buf = strim(buf);
-
-       if (strlen(buf) == 1)
-               switch (buf[0]) {
-               case 'n':
-                       return YN_NO;
-               case 'y':
-                       return YN_YES;
-               case 'N':
-                       return YN_ALLNO;
-               case 'Y':
-                       return YN_ALLYES;
-               }
-       return -1;
-}
-
-#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
-       struct stdio_redirect *stdio = c->stdio;
-
-       if (c->stdio_filter && c->stdio_filter != current)
-               stdio = NULL;
-
-       if (!stdio)
-               return YN_NO;
-
-       if (trans)
-               bch2_trans_unlock(trans);
-
-       unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
-       darray_char line = {};
-       int ret;
-
-       do {
-               unsigned long t;
-               bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
-rewait:
-               t = unlock_long_at
-                       ? max_t(long, unlock_long_at - jiffies, 0)
-                       : MAX_SCHEDULE_TIMEOUT;
-
-               int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
-               if (r == -ETIME) {
-                       bch2_trans_unlock_long(trans);
-                       unlock_long_at = 0;
-                       goto rewait;
-               }
-
-               if (r < 0) {
-                       ret = YN_NO;
-                       break;
-               }
-
-               darray_last(line) = '\0';
-       } while ((ret = parse_yn_response(line.data)) < 0);
-
-       darray_exit(&line);
-       return ret;
-}
-#else
-
-#include "tools-util.h"
-
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
-       char *buf = NULL;
-       size_t buflen = 0;
-       int ret;
-
-       do {
-               fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
-               fflush(stdout);
-
-               if (getline(&buf, &buflen, stdin) < 0)
-                       die("error reading from standard input");
-       } while ((ret = parse_yn_response(buf)) < 0);
-
-       free(buf);
-       return ret;
-}
-
-#endif
-
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
-                                          enum bch_sb_error_id id)
-{
-       struct fsck_err_state *s;
-
-       list_for_each_entry(s, &c->fsck_error_msgs, list)
-               if (s->id == id) {
-                       /*
-                        * move it to the head of the list: repeated fsck errors
-                        * are common
-                        */
-                       list_move(&s->list, &c->fsck_error_msgs);
-                       return s;
-               }
-
-       s = kzalloc(sizeof(*s), GFP_NOFS);
-       if (!s) {
-               if (!c->fsck_alloc_msgs_err)
-                       bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
-               c->fsck_alloc_msgs_err = true;
-               return NULL;
-       }
-
-       INIT_LIST_HEAD(&s->list);
-       s->id = id;
-       list_add(&s->list, &c->fsck_error_msgs);
-       return s;
-}
-
-/* s/fix?/fixing/ s/recreate?/recreating/ */
-static void prt_actioning(struct printbuf *out, const char *action)
-{
-       unsigned len = strlen(action);
-
-       BUG_ON(action[len - 1] != '?');
-       --len;
-
-       if (action[len - 1] == 'e')
-               --len;
-
-       prt_bytes(out, action, len);
-       prt_str(out, "ing");
-}
-
-static const u8 fsck_flags_extra[] = {
-#define x(t, n, flags)         [BCH_FSCK_ERR_##t] = flags,
-       BCH_SB_ERRS()
-#undef x
-};
-
-static int do_fsck_ask_yn(struct bch_fs *c,
-                         struct btree_trans *trans,
-                         struct printbuf *question,
-                         const char *action)
-{
-       prt_str(question, ", ");
-       prt_str(question, action);
-
-       if (bch2_fs_stdio_redirect(c))
-               bch2_print(c, "%s", question->buf);
-       else
-               bch2_print_str(c, KERN_ERR, question->buf);
-
-       int ask = bch2_fsck_ask_yn(c, trans);
-
-       if (trans) {
-               int ret = bch2_trans_relock(trans);
-               if (ret)
-                       return ret;
-       }
-
-       return ask;
-}
-
-static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
-                         enum bch_sb_error_id id, const char *msg,
-                         bool *repeat, bool *print, bool *suppress)
-{
-       bch2_sb_error_count(c, id);
-
-       struct fsck_err_state *s = fsck_err_get(c, id);
-       if (s) {
-               /*
-                * We may be called multiple times for the same error on
-                * transaction restart - this memoizes instead of asking the user
-                * multiple times for the same error:
-                */
-               if (s->last_msg && !strcmp(msg, s->last_msg)) {
-                       *repeat = true;
-                       *print = false;
-                       return s;
-               }
-
-               kfree(s->last_msg);
-               s->last_msg = kstrdup(msg, GFP_KERNEL);
-
-               if (c->opts.ratelimit_errors &&
-                   s->nr >= FSCK_ERR_RATELIMIT_NR) {
-                       if (s->nr == FSCK_ERR_RATELIMIT_NR)
-                               *suppress = true;
-                       else
-                               *print = false;
-               }
-
-               s->nr++;
-       }
-       return s;
-}
-
-bool __bch2_count_fsck_err(struct bch_fs *c,
-                          enum bch_sb_error_id id, struct printbuf *msg)
-{
-       bch2_sb_error_count(c, id);
-
-       mutex_lock(&c->fsck_error_msgs_lock);
-       bool print = true, repeat = false, suppress = false;
-
-       count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
-       mutex_unlock(&c->fsck_error_msgs_lock);
-
-       if (suppress)
-               prt_printf(msg, "Ratelimiting new instances of previous error\n");
-
-       return print && !repeat;
-}
-
-int bch2_fsck_err_opt(struct bch_fs *c,
-                     enum bch_fsck_flags flags,
-                     enum bch_sb_error_id err)
-{
-       if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
-               flags |= fsck_flags_extra[err];
-
-       if (test_bit(BCH_FS_in_fsck, &c->flags)) {
-               if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
-                       return bch_err_throw(c, fsck_repair_unimplemented);
-
-               switch (c->opts.fix_errors) {
-               case FSCK_FIX_exit:
-                       return bch_err_throw(c, fsck_errors_not_fixed);
-               case FSCK_FIX_yes:
-                       if (flags & FSCK_CAN_FIX)
-                               return bch_err_throw(c, fsck_fix);
-                       fallthrough;
-               case FSCK_FIX_no:
-                       if (flags & FSCK_CAN_IGNORE)
-                               return bch_err_throw(c, fsck_ignore);
-                       return bch_err_throw(c, fsck_errors_not_fixed);
-               case FSCK_FIX_ask:
-                       if (flags & FSCK_AUTOFIX)
-                               return bch_err_throw(c, fsck_fix);
-                       return bch_err_throw(c, fsck_ask);
-               default:
-                       BUG();
-               }
-       } else {
-               if ((flags & FSCK_AUTOFIX) &&
-                   (c->opts.errors == BCH_ON_ERROR_continue ||
-                    c->opts.errors == BCH_ON_ERROR_fix_safe))
-                       return bch_err_throw(c, fsck_fix);
-
-               if (c->opts.errors == BCH_ON_ERROR_continue &&
-                   (flags & FSCK_CAN_IGNORE))
-                       return bch_err_throw(c, fsck_ignore);
-               return bch_err_throw(c, fsck_errors_not_fixed);
-       }
-}
-
-int __bch2_fsck_err(struct bch_fs *c,
-                 struct btree_trans *trans,
-                 enum bch_fsck_flags flags,
-                 enum bch_sb_error_id err,
-                 const char *fmt, ...)
-{
-       va_list args;
-       struct printbuf buf = PRINTBUF, *out = &buf;
-       int ret = 0;
-       const char *action_orig = "fix?", *action = action_orig;
-
-       might_sleep();
-
-       if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
-               flags |= fsck_flags_extra[err];
-
-       if (!c)
-               c = trans->c;
-
-       /*
-        * Ugly: if there's a transaction in the current task it has to be
-        * passed in to unlock if we prompt for user input.
-        *
-        * But, plumbing a transaction and transaction restarts into
-        * bkey_validate() is problematic.
-        *
-        * So:
-        * - make all bkey errors AUTOFIX, they're simple anyways (we just
-        *   delete the key)
-        * - and we don't need to warn if we're not prompting
-        */
-       WARN_ON((flags & FSCK_CAN_FIX) &&
-               !(flags & FSCK_AUTOFIX) &&
-               !trans &&
-               bch2_current_has_btree_trans(c));
-
-       if (test_bit(err, c->sb.errors_silent))
-               return flags & FSCK_CAN_FIX
-                       ? bch_err_throw(c, fsck_fix)
-                       : bch_err_throw(c, fsck_ignore);
-
-       printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
-       if (strncmp(fmt, "bcachefs", 8))
-               prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
-       va_start(args, fmt);
-       prt_vprintf(out, fmt, args);
-       va_end(args);
-
-       /* Custom fix/continue/recreate/etc.? */
-       if (out->buf[out->pos - 1] == '?') {
-               const char *p = strrchr(out->buf, ',');
-               if (p) {
-                       out->pos = p - out->buf;
-                       action = kstrdup(p + 2, GFP_KERNEL);
-                       if (!action) {
-                               ret = -ENOMEM;
-                               goto err;
-                       }
-               }
-       }
-
-       mutex_lock(&c->fsck_error_msgs_lock);
-       bool repeat = false, print = true, suppress = false;
-       bool inconsistent = false, exiting = false;
-       struct fsck_err_state *s =
-               count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress);
-       if (repeat) {
-               ret = s->ret;
-               goto err_unlock;
-       }
-
-       if ((flags & FSCK_AUTOFIX) &&
-           (c->opts.errors == BCH_ON_ERROR_continue ||
-            c->opts.errors == BCH_ON_ERROR_fix_safe)) {
-               prt_str(out, ", ");
-               if (flags & FSCK_CAN_FIX) {
-                       prt_actioning(out, action);
-                       ret = bch_err_throw(c, fsck_fix);
-               } else {
-                       prt_str(out, ", continuing");
-                       ret = bch_err_throw(c, fsck_ignore);
-               }
-
-               goto print;
-       } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
-               if (c->opts.errors != BCH_ON_ERROR_continue ||
-                   !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
-                       prt_str_indented(out, ", shutting down\n"
-                                        "error not marked as autofix and not in fsck\n"
-                                        "run fsck, and forward to devs so error can be marked for self-healing");
-                       inconsistent = true;
-                       print = true;
-                       ret = bch_err_throw(c, fsck_errors_not_fixed);
-               } else if (flags & FSCK_CAN_FIX) {
-                       prt_str(out, ", ");
-                       prt_actioning(out, action);
-                       ret = bch_err_throw(c, fsck_fix);
-               } else {
-                       prt_str(out, ", continuing");
-                       ret = bch_err_throw(c, fsck_ignore);
-               }
-       } else if (c->opts.fix_errors == FSCK_FIX_exit) {
-               prt_str(out, ", exiting");
-               ret = bch_err_throw(c, fsck_errors_not_fixed);
-       } else if (flags & FSCK_CAN_FIX) {
-               int fix = s && s->fix
-                       ? s->fix
-                       : c->opts.fix_errors;
-
-               if (fix == FSCK_FIX_ask) {
-                       print = false;
-
-                       ret = do_fsck_ask_yn(c, trans, out, action);
-                       if (ret < 0)
-                               goto err_unlock;
-
-                       if (ret >= YN_ALLNO && s)
-                               s->fix = ret == YN_ALLNO
-                                       ? FSCK_FIX_no
-                                       : FSCK_FIX_yes;
-
-                       ret = ret & 1
-                               ? bch_err_throw(c, fsck_fix)
-                               : bch_err_throw(c, fsck_ignore);
-               } else if (fix == FSCK_FIX_yes ||
-                          (c->opts.nochanges &&
-                           !(flags & FSCK_CAN_IGNORE))) {
-                       prt_str(out, ", ");
-                       prt_actioning(out, action);
-                       ret = bch_err_throw(c, fsck_fix);
-               } else {
-                       prt_str(out, ", not ");
-                       prt_actioning(out, action);
-                       ret = bch_err_throw(c, fsck_ignore);
-               }
-       } else {
-               if (flags & FSCK_CAN_IGNORE) {
-                       prt_str(out, ", continuing");
-                       ret = bch_err_throw(c, fsck_ignore);
-               } else {
-                       prt_str(out, " (repair unimplemented)");
-                       ret = bch_err_throw(c, fsck_repair_unimplemented);
-               }
-       }
-
-       if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
-           (c->opts.fix_errors == FSCK_FIX_exit ||
-            !(flags & FSCK_CAN_IGNORE)))
-               ret = bch_err_throw(c, fsck_errors_not_fixed);
-
-       if (test_bit(BCH_FS_in_fsck, &c->flags) &&
-           (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
-            !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
-               exiting = true;
-               print = true;
-       }
-print:
-       prt_newline(out);
-
-       if (inconsistent)
-               __bch2_inconsistent_error(c, out);
-       else if (exiting)
-               prt_printf(out, "Unable to continue, halting\n");
-       else if (suppress)
-               prt_printf(out, "Ratelimiting new instances of previous error\n");
-
-       if (print) {
-               /* possibly strip an empty line, from printbuf_indent_add */
-               while (out->pos && out->buf[out->pos - 1] == ' ')
-                       --out->pos;
-               printbuf_nul_terminate(out);
-
-               if (bch2_fs_stdio_redirect(c))
-                       bch2_print(c, "%s", out->buf);
-               else
-                       bch2_print_str(c, KERN_ERR, out->buf);
-       }
-
-       if (s)
-               s->ret = ret;
-
-       if (trans &&
-           !(flags & FSCK_ERR_NO_LOG) &&
-           ret == -BCH_ERR_fsck_fix)
-               ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret;
-err_unlock:
-       mutex_unlock(&c->fsck_error_msgs_lock);
-err:
-       /*
-        * We don't yet track whether the filesystem currently has errors, for
-        * log_fsck_err()s: that would require us to track for every error type
-        * which recovery pass corrects it, to get the fsck exit status correct:
-        */
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               /* nothing */
-       } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
-               set_bit(BCH_FS_errors_fixed, &c->flags);
-       } else {
-               set_bit(BCH_FS_errors_not_fixed, &c->flags);
-               set_bit(BCH_FS_error, &c->flags);
-       }
-
-       if (action != action_orig)
-               kfree(action);
-       printbuf_exit(&buf);
-
-       BUG_ON(!ret);
-       return ret;
-}
-
-static const char * const bch2_bkey_validate_contexts[] = {
-#define x(n) #n,
-       BKEY_VALIDATE_CONTEXTS()
-#undef x
-       NULL
-};
-
-int __bch2_bkey_fsck_err(struct bch_fs *c,
-                        struct bkey_s_c k,
-                        struct bkey_validate_context from,
-                        enum bch_sb_error_id err,
-                        const char *fmt, ...)
-{
-       if (from.flags & BCH_VALIDATE_silent)
-               return bch_err_throw(c, fsck_delete_bkey);
-
-       unsigned fsck_flags = 0;
-       if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
-               if (test_bit(err, c->sb.errors_silent))
-                       return bch_err_throw(c, fsck_delete_bkey);
-
-               fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
-       }
-       if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
-               fsck_flags |= fsck_flags_extra[err];
-
-       struct printbuf buf = PRINTBUF;
-       prt_printf(&buf, "invalid bkey in %s",
-                  bch2_bkey_validate_contexts[from.from]);
-
-       if (from.from == BKEY_VALIDATE_journal)
-               prt_printf(&buf, " journal seq=%llu offset=%u",
-                          from.journal_seq, from.journal_offset);
-
-       prt_str(&buf, " btree=");
-       bch2_btree_id_to_text(&buf, from.btree);
-       prt_printf(&buf, " level=%u: ", from.level);
-
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_newline(&buf);
-
-       va_list args;
-       va_start(args, fmt);
-       prt_vprintf(&buf, fmt, args);
-       va_end(args);
-
-       int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
-{
-       struct fsck_err_state *s, *n;
-
-       mutex_lock(&c->fsck_error_msgs_lock);
-
-       list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
-               if (print && s->ratelimited && s->last_msg)
-                       bch_err(c, "Saw %llu errors like:\n  %s", s->nr, s->last_msg);
-
-               list_del(&s->list);
-               kfree(s->last_msg);
-               kfree(s);
-       }
-
-       mutex_unlock(&c->fsck_error_msgs_lock);
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
-       __bch2_flush_fsck_errs(c, true);
-}
-
-void bch2_free_fsck_errs(struct bch_fs *c)
-{
-       __bch2_flush_fsck_errs(c, false);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-                                   subvol_inum inum, u64 offset)
-{
-       u32 restart_count = trans->restart_count;
-       int ret = 0;
-
-       if (inum.subvol) {
-               ret = bch2_inum_to_path(trans, inum, out);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       return ret;
-       }
-       if (!inum.subvol || ret)
-               prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
-       prt_printf(out, " offset %llu: ", offset);
-
-       return trans_was_restarted(trans, restart_count);
-}
-
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
-                             subvol_inum inum, u64 offset)
-{
-       bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
-}
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-                                       struct bpos pos)
-{
-       int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
-       if (ret)
-               return ret;
-
-       prt_printf(out, " offset %llu: ", pos.offset << 8);
-       return 0;
-}
-
-void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
-                                 struct bpos pos)
-{
-       bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
-}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644 (file)
index 0c3c3a2..0000000
+++ /dev/null
@@ -1,258 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "bkey_types.h"
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-void __bch2_log_msg_start(const char *, struct printbuf *);
-
-static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
-{
-       __bch2_log_msg_start(c->name, out);
-}
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *);
-bool bch2_inconsistent_error(struct bch_fs *);
-__printf(2, 3)
-bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...);
-
-#define bch2_fs_inconsistent_on(cond, ...)                             \
-({                                                                     \
-       bool _ret = unlikely(!!(cond));                                 \
-       if (_ret)                                                       \
-               bch2_fs_inconsistent(__VA_ARGS__);                      \
-       _ret;                                                           \
-})
-
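The _on() statement-expression variants both test and report, so call sites stay single-line. A hypothetical call site (FOO_MAX and nr are invented):

    /* test, log, and apply the errors= policy in one expression */
    if (bch2_fs_inconsistent_on(nr > FOO_MAX, c,
                                "too many foos: %u > %u", nr, FOO_MAX))
            return -EIO;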
-__printf(2, 3)
-bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...);
-
-#define bch2_trans_inconsistent_on(cond, ...)                          \
-({                                                                     \
-       bool _ret = unlikely(!!(cond));                                 \
-       if (_ret)                                                       \
-               bch2_trans_inconsistent(__VA_ARGS__);                   \
-       _ret;                                                           \
-})
-
-int __bch2_topology_error(struct bch_fs *, struct printbuf *);
-__printf(2, 3)
-int bch2_fs_topology_error(struct bch_fs *, const char *, ...);
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-struct fsck_err_state {
-       struct list_head        list;
-       enum bch_sb_error_id    id;
-       u64                     nr;
-       bool                    ratelimited;
-       int                     ret;
-       int                     fix;
-       char                    *last_msg;
-};
-
-#define fsck_err_count(_c, _err)       bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-
-bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
-#define bch2_count_fsck_err(_c, _err, ...)                             \
-       __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
-
-int bch2_fsck_err_opt(struct bch_fs *,
-                     enum bch_fsck_flags,
-                     enum bch_sb_error_id);
-
-__printf(5, 6) __cold
-int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
-                 enum bch_fsck_flags,
-                 enum bch_sb_error_id,
-                 const char *, ...);
-#define bch2_fsck_err(c, _flags, _err_type, ...)                               \
-       __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
-                       type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
-                       _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
-
-void bch2_flush_fsck_errs(struct bch_fs *);
-void bch2_free_fsck_errs(struct bch_fs *);
-
-#define fsck_err_wrap(_do)                                             \
-({                                                                     \
-       int _ret = _do;                                                 \
-       if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) &&                \
-           !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) {             \
-               ret = _ret;                                             \
-               goto fsck_err;                                          \
-       }                                                               \
-                                                                       \
-       bch2_err_matches(_ret, BCH_ERR_fsck_fix);                       \
-})
-
-#define __fsck_err(...)                fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#define __fsck_err_on(cond, c, _flags, _err_type, ...)                 \
-({                                                                     \
-       might_sleep();                                                  \
-                                                                       \
-       if (type_is(c, struct bch_fs *))                                \
-               WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
-                                                                       \
-       (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
-})
-
-#define mustfix_fsck_err(c, _err_type, ...)                            \
-       __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, _err_type, ...)                   \
-       __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define fsck_err(c, _err_type, ...)                                    \
-       __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define fsck_err_on(cond, c, _err_type, ...)                           \
-       __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err(c, _err_type, ...)                                        \
-       __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err_on(cond, ...)                                     \
-({                                                                     \
-       bool _ret = unlikely(!!(cond));                                 \
-       if (_ret)                                                       \
-               log_fsck_err(__VA_ARGS__);                              \
-       _ret;                                                           \
-})
-
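These macros assume a local int ret and an fsck_err: label in the calling function: on an unrecoverable answer they set ret and jump there; otherwise they evaluate to true when the caller should repair. A minimal hypothetical check function (struct foo, FOO_MAX, and the foo_count_bad error id are invented):

    static int check_foo(struct bch_fs *c, struct foo *f)
    {
            int ret = 0;

            if (fsck_err_on(f->count > FOO_MAX, c, foo_count_bad,
                            "foo count too big: %u", f->count))
                    f->count = FOO_MAX;     /* the repair */
    fsck_err:
            return ret;
    }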
-enum bch_validate_flags;
-__printf(5, 6)
-int __bch2_bkey_fsck_err(struct bch_fs *,
-                        struct bkey_s_c,
-                        struct bkey_validate_context from,
-                        enum bch_sb_error_id,
-                        const char *, ...);
-
-/*
- * for now, bkey fsck errors are always handled by deleting the entire key -
- * this will change at some point
- */
-#define bkey_fsck_err(c, _err_type, _err_msg, ...)                     \
-do {                                                                   \
-       int _ret = __bch2_bkey_fsck_err(c, k, from,                     \
-                               BCH_FSCK_ERR_##_err_type,               \
-                               _err_msg, ##__VA_ARGS__);               \
-       if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) &&                \
-           !bch2_err_matches(_ret, BCH_ERR_fsck_ignore))               \
-               ret = _ret;                                             \
-       ret = bch_err_throw(c, fsck_delete_bkey);                       \
-       goto fsck_err;                                                  \
-} while (0)
-
-#define bkey_fsck_err_on(cond, ...)                                    \
-do {                                                                   \
-       if (unlikely(cond))                                             \
-               bkey_fsck_err(__VA_ARGS__);                             \
-} while (0)
-
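A hypothetical validate hook showing the shape these macros expect ("foo", its error id, and BKEY_FOO_VAL_U64s_MAX are invented; compare the real bch2_btree_ptr_validate() in extents.c further below):

    static int bch2_foo_validate(struct bch_fs *c, struct bkey_s_c k,
                                 struct bkey_validate_context from)
    {
            int ret = 0;

            bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_FOO_VAL_U64s_MAX,
                             c, foo_val_too_big,
                             "value too big (%zu > %u)",
                             bkey_val_u64s(k.k), BKEY_FOO_VAL_U64s_MAX);
    fsck_err:
            return ret;
    }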
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch2_fatal_error(struct bch_fs *);
-
-#define bch2_fs_fatal_error(c, _msg, ...)                              \
-do {                                                                   \
-       bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
-       bch2_fatal_error(c);                                            \
-} while (0)
-
-#define bch2_fs_fatal_err_on(cond, c, ...)                             \
-({                                                                     \
-       bool _ret = unlikely(!!(cond));                                 \
-                                                                       \
-       if (_ret)                                                       \
-               bch2_fs_fatal_error(c, __VA_ARGS__);                    \
-       _ret;                                                           \
-})
-
-/*
- * IO errors: either recoverable metadata IO (because we have replicas), or data
- * IO - we need to log it and print out a message, but we don't (necessarily)
- * want to shut down the fs:
- */
-
-void bch2_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-static inline void bch2_account_io_success_fail(struct bch_dev *ca,
-                                               enum bch_member_error_type type,
-                                               bool success)
-{
-       if (likely(success)) {
-               if (type == BCH_MEMBER_ERROR_write &&
-                   ca->write_errors_start)
-                       ca->write_errors_start = 0;
-       } else {
-               bch2_io_error(ca, type);
-       }
-}
-
-static inline void bch2_account_io_completion(struct bch_dev *ca,
-                                             enum bch_member_error_type type,
-                                             u64 submit_time, bool success)
-{
-       if (unlikely(!ca))
-               return;
-
-       if (type != BCH_MEMBER_ERROR_checksum)
-               bch2_latency_acct(ca, submit_time, type);
-
-       bch2_account_io_success_fail(ca, type, success);
-}
-
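A sketch of a completion handler wired into this accounting (the container struct and its fields are invented):

    /* hypothetical bio completion: one call records latency and
     * success/failure against the right per-device counters */
    static void foo_read_endio(struct bio *bio)
    {
            struct foo_read *rd = bio->bi_private;  /* invented container */

            bch2_account_io_completion(rd->ca, BCH_MEMBER_ERROR_read,
                                       rd->submit_time, !bio->bi_status);
    }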
-int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-
-void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
-void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
-
-#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
deleted file mode 100644 (file)
index e76e58a..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "extents.h"
-#include "extent_update.h"
-
-/*
- * This counts the number of iterators to the alloc & ec btrees we'll need
- * inserting/removing this extent:
- */
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       unsigned ret = 0, lru = 0;
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       /* Might also be updating LRU btree */
-                       if (entry->ptr.cached)
-                               lru++;
-
-                       fallthrough;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ret++;
-               }
-       }
-
-       /*
-        * Updating keys in the alloc btree may also update keys in the
-        * freespace or discard btrees:
-        */
-       return lru + ret * 2;
-}
-
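Concretely, under the accounting above (a worked example, not from the source):

    /*
     * extent with 2 dirty pointers + 1 cached pointer:
     *   ret = 3   (all three pointers touch the alloc btree)
     *   lru = 1   (the cached pointer may also need an LRU-btree update)
     * budget = lru + ret * 2 = 1 + 3 * 2 = 7 iterators, the factor of 2
     * covering the freespace/discard-btree cascades
     */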
-#define EXTENT_ITERS_MAX       64
-
-static int count_iters_for_insert(struct btree_trans *trans,
-                                 struct bkey_s_c k,
-                                 unsigned offset,
-                                 struct bpos *end,
-                                 unsigned *nr_iters)
-{
-       int ret = 0, ret2 = 0;
-
-       if (*nr_iters >= EXTENT_ITERS_MAX) {
-               *end = bpos_min(*end, k.k->p);
-               ret = 1;
-       }
-
-       switch (k.k->type) {
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
-               if (*nr_iters >= EXTENT_ITERS_MAX) {
-                       *end = bpos_min(*end, k.k->p);
-                       ret = 1;
-               }
-
-               break;
-       case KEY_TYPE_reflink_p: {
-               struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-               u64 idx = REFLINK_P_IDX(p.v);
-               unsigned sectors = bpos_min(*end, p.k->p).offset -
-                       bkey_start_offset(p.k);
-               struct btree_iter iter;
-               struct bkey_s_c r_k;
-
-               for_each_btree_key_norestart(trans, iter,
-                                  BTREE_ID_reflink, POS(0, idx + offset),
-                                  BTREE_ITER_slots, r_k, ret2) {
-                       if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
-                               break;
-
-                       /* extent_update_to_keys(), for the reflink_v update */
-                       *nr_iters += 1;
-
-                       *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
-                       if (*nr_iters >= EXTENT_ITERS_MAX) {
-                               struct bpos pos = bkey_start_pos(k.k);
-                               pos.offset += min_t(u64, k.k->size,
-                                                   r_k.k->p.offset - idx);
-
-                               *end = bpos_min(*end, pos);
-                               ret = 1;
-                               break;
-                       }
-               }
-               bch2_trans_iter_exit(trans, &iter);
-
-               break;
-       }
-       }
-
-       return ret2 ?: ret;
-}
-
-int bch2_extent_atomic_end(struct btree_trans *trans,
-                          struct btree_iter *iter,
-                          struct bpos *end)
-{
-       unsigned nr_iters = 0;
-
-       struct btree_iter copy;
-       bch2_trans_copy_iter(trans, &copy, iter);
-
-       int ret = bch2_btree_iter_traverse(trans, &copy);
-       if (ret)
-               goto err;
-
-       struct bkey_s_c k;
-       for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
-               unsigned offset = 0;
-
-               if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
-                       offset = iter->pos.offset - bkey_start_offset(k.k);
-
-               ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_iter_exit(trans, &copy);
-       return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
-                           struct btree_iter *iter,
-                           struct bkey_i *k)
-{
-       struct bpos end = k->k.p;
-       int ret = bch2_extent_atomic_end(trans, iter, &end);
-       if (ret)
-               return ret;
-
-       /* tracepoint */
-
-       if (bpos_lt(end, k->k.p)) {
-               if (trace_extent_trim_atomic_enabled()) {
-                       CLASS(printbuf, buf)();
-                       bch2_bpos_to_text(&buf, end);
-                       prt_newline(&buf);
-                       bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
-                       trace_extent_trim_atomic(trans->c, buf.buf);
-               }
-               bch2_cut_back(end, k);
-       }
-       return 0;
-}
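A hypothetical insert path showing where the trim slots in (the wrapper itself is invented; the two callees are the real functions above):

    /* clamp the update so it stays within the transaction's iterator
     * budget; the caller loops over any remainder */
    static int foo_extent_update(struct btree_trans *trans,
                                 struct btree_iter *iter,
                                 struct bkey_i *k)
    {
            int ret = bch2_extent_trim_atomic(trans, iter, k);
            if (ret)
                    return ret;

            return bch2_trans_update(trans, iter, k, 0);
    }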
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
deleted file mode 100644 (file)
index 34467db..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENT_UPDATE_H
-#define _BCACHEFS_EXTENT_UPDATE_H
-
-#include "bcachefs.h"
-
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
-                          struct bpos *);
-int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
-                           struct bkey_i *);
-
-#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
deleted file mode 100644 (file)
index 83cbd77..0000000
+++ /dev/null
@@ -1,1735 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-#include "util.h"
-
-static const char * const bch2_extent_flags_strs[] = {
-#define x(n, v)        [BCH_EXTENT_FLAG_##n] = #n,
-       BCH_EXTENT_FLAGS()
-#undef x
-       NULL,
-};
-
-static unsigned bch2_crc_field_size_max[] = {
-       [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
-       [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
-       [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *,
-                                struct bch_extent_crc_unpacked,
-                                enum bch_extent_entry_type);
-
-void bch2_io_failures_to_text(struct printbuf *out,
-                             struct bch_fs *c,
-                             struct bch_io_failures *failed)
-{
-       static const char * const error_types[] = {
-               "btree validate", "io", "checksum", "ec reconstruct", NULL
-       };
-
-       for (struct bch_dev_io_failures *f = failed->devs;
-            f < failed->devs + failed->nr;
-            f++) {
-               unsigned errflags =
-                       ((!!f->failed_btree_validate)   << 0) |
-                       ((!!f->failed_io)               << 1) |
-                       ((!!f->failed_csum_nr)          << 2) |
-                       ((!!f->failed_ec)               << 3);
-
-               bch2_printbuf_make_room(out, 1024);
-               out->atomic++;
-               scoped_guard(rcu) {
-                       struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
-                       if (ca)
-                               prt_str(out, ca->name);
-                       else
-                               prt_printf(out, "(invalid device %u)", f->dev);
-               }
-               --out->atomic;
-
-               prt_char(out, ' ');
-
-               if (!errflags) {
-                       prt_str(out, "no error - confused");
-               } else if (is_power_of_2(errflags)) {
-                       prt_bitflags(out, error_types, errflags);
-                       prt_str(out, " error");
-               } else {
-                       prt_str(out, "errors: ");
-                       prt_bitflags(out, error_types, errflags);
-               }
-               prt_newline(out);
-       }
-}
-
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
-                                                unsigned dev)
-{
-       struct bch_dev_io_failures *i;
-
-       for (i = f->devs; i < f->devs + f->nr; i++)
-               if (i->dev == dev)
-                       return i;
-
-       return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
-                         struct extent_ptr_decoded *p,
-                         bool csum_error)
-{
-       struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
-
-       if (!f) {
-               BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-               f = &failed->devs[failed->nr++];
-               memset(f, 0, sizeof(*f));
-               f->dev = p->ptr.dev;
-       }
-
-       if (p->do_ec_reconstruct)
-               f->failed_ec = true;
-       else if (!csum_error)
-               f->failed_io = true;
-       else
-               f->failed_csum_nr++;
-}
-
-void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
-                                     unsigned dev)
-{
-       struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
-
-       if (!f) {
-               BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
-               f = &failed->devs[failed->nr++];
-               memset(f, 0, sizeof(*f));
-               f->dev = dev;
-       }
-
-       f->failed_btree_validate = true;
-}
-
-static inline u64 dev_latency(struct bch_dev *ca)
-{
-       return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-static inline int dev_failed(struct bch_dev *ca)
-{
-       return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
-                             const struct extent_ptr_decoded p1,
-                             u64 p1_latency,
-                             struct bch_dev *ca1,
-                             const struct extent_ptr_decoded p2,
-                             u64 p2_latency)
-{
-       struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
-       int failed_delta = dev_failed(ca1) - dev_failed(ca2);
-       if (unlikely(failed_delta))
-               return failed_delta < 0;
-
-       if (static_branch_unlikely(&bch2_force_reconstruct_read))
-               return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
-
-       if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
-               return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
-
-       int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
-       if (unlikely(crc_retry_delta))
-               return crc_retry_delta < 0;
-
-       /* Pick at random, biased in favor of the faster device: */
-
-       return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
-}
-
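Squaring the latencies keeps some reads flowing to the slower device while strongly preferring the fast one. A standalone toy model of the biased draw (illustrative only, continuous instead of the kernel's integer draw):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* devices at 1ms and 3ms; squared weights 1 and 9 */
            double l1 = 1 * 1, l2 = 3 * 3;
            unsigned picks1 = 0, trials = 1000000;

            for (unsigned i = 0; i < trials; i++) {
                    /* mirrors ptr_better(): p1 wins when the draw lands past l1 */
                    double draw = (l1 + l2) * (rand() / (RAND_MAX + 1.0));
                    if (draw > l1)
                            picks1++;
            }

            printf("fast device picked %.1f%% (expect ~%.0f%%)\n",
                   100.0 * picks1 / trials, 100.0 * l2 / (l1 + l2));
            return 0;
    }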
-/*
- * This picks a non-stale pointer, preferably from a device other than @avoid.
- * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
- * other devices, it will still pick a pointer from avoid.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-                              struct bch_io_failures *failed,
-                              struct extent_ptr_decoded *pick,
-                              int dev)
-{
-       bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
-       bool have_dirty_ptrs = false, have_pick = false;
-
-       if (k.k->type == KEY_TYPE_error)
-               return bch_err_throw(c, key_type_error);
-
-       rcu_read_lock();
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       u64 pick_latency;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               have_dirty_ptrs |= !p.ptr.cached;
-
-               /*
-                * Unwritten extent: no need to actually read, treat it as a
-                * hole and return 0s:
-                */
-               if (p.ptr.unwritten) {
-                       rcu_read_unlock();
-                       return 0;
-               }
-
-               /* Are we being asked to read from a specific device? */
-               if (dev >= 0 && p.ptr.dev != dev)
-                       continue;
-
-               struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
-
-               if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
-                       rcu_read_unlock();
-                       int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
-                       if (ret)
-                               return ret;
-                       rcu_read_lock();
-               }
-
-               if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
-                       continue;
-
-               struct bch_dev_io_failures *f =
-                       unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
-               if (unlikely(f)) {
-                       p.crc_retry_nr     = f->failed_csum_nr;
-                       p.has_ec          &= ~f->failed_ec;
-
-                       if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
-                               have_io_errors  |= f->failed_io;
-                               have_io_errors  |= f->failed_btree_validate;
-                               have_io_errors  |= f->failed_ec;
-                       }
-                       have_csum_errors        |= !!f->failed_csum_nr;
-
-                       if (p.has_ec && (f->failed_io || f->failed_csum_nr))
-                               p.do_ec_reconstruct = true;
-                       else if (f->failed_io ||
-                                f->failed_btree_validate ||
-                                f->failed_csum_nr > c->opts.checksum_err_retry_nr)
-                               continue;
-               }
-
-               have_missing_devs |= ca && !bch2_dev_is_online(ca);
-
-               if (!ca || !bch2_dev_is_online(ca)) {
-                       if (!p.has_ec)
-                               continue;
-                       p.do_ec_reconstruct = true;
-               }
-
-               if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
-                       p.do_ec_reconstruct = true;
-
-               u64 p_latency = dev_latency(ca);
-               /*
-                * Square the latencies, to bias more in favor of the faster
-                * device - we never want to stop issuing reads to the slower
-                * device altogether, so that we can update our latency numbers:
-                */
-               p_latency *= p_latency;
-
-               if (!have_pick ||
-                   ptr_better(c,
-                              p, p_latency, ca,
-                              *pick, pick_latency)) {
-                       *pick = p;
-                       pick_latency = p_latency;
-                       have_pick = true;
-               }
-       }
-       rcu_read_unlock();
-
-       if (have_pick)
-               return 1;
-       if (!have_dirty_ptrs)
-               return 0;
-       if (have_missing_devs)
-               return bch_err_throw(c, no_device_to_read_from);
-       if (have_csum_errors)
-               return bch_err_throw(c, data_read_csum_err);
-       if (have_io_errors)
-               return bch_err_throw(c, data_read_io_err);
-
-       /*
-        * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
-        * but they don't point to valid devices:
-        */
-       return bch_err_throw(c, no_devices_valid);
-}
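-
-/*
- * Return convention for callers in the read path: 1 means *pick is
- * valid, 0 means treat the extent as a hole and return zeroes (either
- * an unwritten extent or no dirty pointers), and a negative error means
- * no readable copy exists, with the specific error distinguishing
- * checksum errors, IO errors and missing devices.
- */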
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
-                        c, btree_ptr_val_too_big,
-                        "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-
-       ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
-                              struct bkey_validate_context from)
-{
-       struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
-                        c, btree_ptr_v2_val_too_big,
-                        "value too big (%zu > %zu)",
-                        bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-       bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
-                        c, btree_ptr_v2_min_key_bad,
-                        "min_key > key");
-
-       if ((from.flags & BCH_VALIDATE_write) &&
-           c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
-               bkey_fsck_err_on(!bp.v->sectors_written,
-                                c, btree_ptr_v2_written_0,
-                                "sectors_written == 0");
-
-       ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
-                              struct bkey_s_c k)
-{
-       struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
-       prt_printf(out, "seq %llx written %u min_key %s",
-              le64_to_cpu(bp.v->seq),
-              le16_to_cpu(bp.v->sectors_written),
-              BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
-
-       bch2_bpos_to_text(out, bp.v->min_key);
-       prt_printf(out, " ");
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
-                             unsigned big_endian, int write,
-                             struct bkey_s k)
-{
-       struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
-
-       compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
-
-       if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id_is_extents(btree_id) &&
-           !bkey_eq(bp.v->min_key, POS_MIN))
-               bp.v->min_key = write
-                       ? bpos_nosnap_predecessor(bp.v->min_key)
-                       : bpos_nosnap_successor(bp.v->min_key);
-}
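-
-/*
- * Illustrative: on pre-inode_btree_change extents btrees, min_key was
- * stored as the predecessor of the first covered position, so an
- * in-memory min_key of (inode 1, offset 8) round-trips through disk as
- * (inode 1, offset 7); the compat helper above applies the conversion
- * in whichever direction the IO is going.
- */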
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
-       struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
-       struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
-       union bch_extent_entry *en_l;
-       const union bch_extent_entry *en_r;
-       struct extent_ptr_decoded lp, rp;
-       bool use_right_ptr;
-
-       en_l = l_ptrs.start;
-       en_r = r_ptrs.start;
-       while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
-               if (extent_entry_type(en_l) != extent_entry_type(en_r))
-                       return false;
-
-               en_l = extent_entry_next(en_l);
-               en_r = extent_entry_next(en_r);
-       }
-
-       if (en_l < l_ptrs.end || en_r < r_ptrs.end)
-               return false;
-
-       en_l = l_ptrs.start;
-       en_r = r_ptrs.start;
-       lp.crc = bch2_extent_crc_unpack(l.k, NULL);
-       rp.crc = bch2_extent_crc_unpack(r.k, NULL);
-
-       guard(rcu)();
-
-       while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
-              __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
-               if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
-                   rp.ptr.offset + rp.crc.offset ||
-                   lp.ptr.dev                  != rp.ptr.dev ||
-                   lp.ptr.gen                  != rp.ptr.gen ||
-                   lp.ptr.unwritten            != rp.ptr.unwritten ||
-                   lp.has_ec                   != rp.has_ec)
-                       return false;
-
-               /* Extents may not straddle buckets: */
-               struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
-               bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
-
-               if (!same_bucket)
-                       return false;
-
-               if (lp.has_ec                   != rp.has_ec ||
-                   (lp.has_ec &&
-                    (lp.ec.block               != rp.ec.block ||
-                     lp.ec.redundancy          != rp.ec.redundancy ||
-                     lp.ec.idx                 != rp.ec.idx)))
-                       return false;
-
-               if (lp.crc.compression_type     != rp.crc.compression_type ||
-                   lp.crc.nonce                != rp.crc.nonce)
-                       return false;
-
-               if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
-                   lp.crc.uncompressed_size) {
-                       /* can use left extent's crc entry */
-               } else if (lp.crc.live_size <= rp.crc.offset) {
-                       /* can use right extent's crc entry */
-               } else {
-                       /* check if checksums can be merged: */
-                       if (lp.crc.csum_type            != rp.crc.csum_type ||
-                           lp.crc.nonce                != rp.crc.nonce ||
-                           crc_is_compressed(lp.crc) ||
-                           !bch2_checksum_mergeable(lp.crc.csum_type))
-                               return false;
-
-                       if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
-                           rp.crc.offset)
-                               return false;
-
-                       if (lp.crc.csum_type &&
-                           lp.crc.uncompressed_size +
-                           rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
-                               return false;
-               }
-
-               en_l = extent_entry_next(en_l);
-               en_r = extent_entry_next(en_r);
-       }
-
-       en_l = l_ptrs.start;
-       en_r = r_ptrs.start;
-       while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
-               if (extent_entry_is_crc(en_l)) {
-                       struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-                       struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
-                       if (crc_l.uncompressed_size + crc_r.uncompressed_size >
-                           bch2_crc_field_size_max[extent_entry_type(en_l)])
-                               return false;
-               }
-
-               en_l = extent_entry_next(en_l);
-               en_r = extent_entry_next(en_r);
-       }
-
-       use_right_ptr = false;
-       en_l = l_ptrs.start;
-       en_r = r_ptrs.start;
-       while (en_l < l_ptrs.end) {
-               if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
-                   use_right_ptr)
-                       en_l->ptr = en_r->ptr;
-
-               if (extent_entry_is_crc(en_l)) {
-                       struct bch_extent_crc_unpacked crc_l =
-                               bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-                       struct bch_extent_crc_unpacked crc_r =
-                               bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
-                       use_right_ptr = false;
-
-                       if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
-                           crc_l.uncompressed_size) {
-                               /* can use left extent's crc entry */
-                       } else if (crc_l.live_size <= crc_r.offset) {
-                               /* can use right extent's crc entry */
-                               crc_r.offset -= crc_l.live_size;
-                               bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
-                                                    extent_entry_type(en_l));
-                               use_right_ptr = true;
-                       } else {
-                               crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
-                                                                crc_l.csum,
-                                                                crc_r.csum,
-                                                                crc_r.uncompressed_size << 9);
-
-                               crc_l.uncompressed_size += crc_r.uncompressed_size;
-                               crc_l.compressed_size   += crc_r.compressed_size;
-                               bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
-                                                    extent_entry_type(en_l));
-                       }
-               }
-
-               en_l = extent_entry_next(en_l);
-               en_r = extent_entry_next(en_r);
-       }
-
-       bch2_key_resize(l.k, l.k->size + r.k->size);
-       return true;
-}
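-
-/*
- * Sketch of a successful merge: two uncompressed, checksummed extents
- * whose data is adjacent on disk within one bucket, on the same device
- * and generation, with a mergeable checksum type (e.g. crc32c); the
- * checksums are combined via bch2_checksum_merge() and the left key is
- * resized to cover both ranges.
- */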
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
-                             struct bkey_validate_context from)
-{
-       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
-                        c, reservation_key_nr_replicas_invalid,
-                        "invalid nr_replicas (%u)", r.v->nr_replicas);
-fsck_err:
-       return ret;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
-                             struct bkey_s_c k)
-{
-       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-       prt_printf(out, "generation %u replicas %u",
-              le32_to_cpu(r.v->generation),
-              r.v->nr_replicas);
-}
-
-bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
-       struct bkey_s_reservation l = bkey_s_to_reservation(_l);
-       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
-
-       if (l.v->generation != r.v->generation ||
-           l.v->nr_replicas != r.v->nr_replicas)
-               return false;
-
-       bch2_key_resize(l.k, l.k->size + r.k->size);
-       return true;
-}
-
-/* Extent checksum entries: */
-
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
-                                        struct bch_extent_crc_unpacked r)
-{
-       return (l.csum_type             != r.csum_type ||
-               l.compression_type      != r.compression_type ||
-               l.compressed_size       != r.compressed_size ||
-               l.uncompressed_size     != r.uncompressed_size ||
-               l.offset                != r.offset ||
-               l.live_size             != r.live_size ||
-               l.nonce                 != r.nonce ||
-               bch2_crc_cmp(l.csum, r.csum));
-}
-
-static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
-                                 struct bch_extent_crc_unpacked n)
-{
-       return !crc_is_compressed(u) &&
-               u.csum_type &&
-               u.uncompressed_size > u.live_size &&
-               bch2_csum_type_is_encryption(u.csum_type) ==
-               bch2_csum_type_is_encryption(n.csum_type);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
-                                struct bch_extent_crc_unpacked n)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
-
-       if (!n.csum_type)
-               return false;
-
-       bkey_for_each_crc(k.k, ptrs, crc, i)
-               if (can_narrow_crc(crc, n))
-                       return true;
-
-       return false;
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- */
-bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-       struct bch_extent_crc_unpacked u;
-       struct extent_ptr_decoded p;
-       union bch_extent_entry *i;
-       bool ret = false;
-
-       /* Find a checksum entry that covers only live data: */
-       if (!n.csum_type) {
-               bkey_for_each_crc(&k->k, ptrs, u, i)
-                       if (!crc_is_compressed(u) &&
-                           u.csum_type &&
-                           u.live_size == u.uncompressed_size) {
-                               n = u;
-                               goto found;
-                       }
-               return false;
-       }
-found:
-       BUG_ON(crc_is_compressed(n));
-       BUG_ON(n.offset);
-       BUG_ON(n.live_size != k->k.size);
-
-restart_narrow_pointers:
-       ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
-       bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
-               if (can_narrow_crc(p.crc, n)) {
-                       bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
-                       p.ptr.offset += p.crc.offset;
-                       p.crc = n;
-                       bch2_extent_ptr_decoded_append(k, &p);
-                       ret = true;
-                       goto restart_narrow_pointers;
-               }
-
-       return ret;
-}
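-
-/*
- * Example with made-up numbers: a pointer whose crc covers 128 sectors
- * with live_size 8 at crc offset 64 makes readers bounce and checksum
- * all 128 sectors; after narrowing, ptr.offset has advanced by 64, the
- * new crc has offset 0 and uncompressed_size == live_size == 8, so
- * reads verify exactly the data they need.
- */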
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
-                                struct bch_extent_crc_unpacked src,
-                                enum bch_extent_entry_type type)
-{
-#define common_fields(_src)                                            \
-               .type                   = BIT(type),                    \
-               .csum_type              = _src.csum_type,               \
-               .compression_type       = _src.compression_type,        \
-               ._compressed_size       = _src.compressed_size - 1,     \
-               ._uncompressed_size     = _src.uncompressed_size - 1,   \
-               .offset                 = _src.offset
-
-       switch (type) {
-       case BCH_EXTENT_ENTRY_crc32:
-               dst->crc32              = (struct bch_extent_crc32) {
-                       common_fields(src),
-                       .csum           = (u32 __force) *((__le32 *) &src.csum.lo),
-               };
-               break;
-       case BCH_EXTENT_ENTRY_crc64:
-               dst->crc64              = (struct bch_extent_crc64) {
-                       common_fields(src),
-                       .nonce          = src.nonce,
-                       .csum_lo        = (u64 __force) src.csum.lo,
-                       .csum_hi        = (u64 __force) *((__le16 *) &src.csum.hi),
-               };
-               break;
-       case BCH_EXTENT_ENTRY_crc128:
-               dst->crc128             = (struct bch_extent_crc128) {
-                       common_fields(src),
-                       .nonce          = src.nonce,
-                       .csum           = src.csum,
-               };
-               break;
-       default:
-               BUG();
-       }
-#undef common_fields
-}
-
-void bch2_extent_crc_append(struct bkey_i *k,
-                           struct bch_extent_crc_unpacked new)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-       union bch_extent_crc *crc = (void *) ptrs.end;
-       enum bch_extent_entry_type type;
-
-       if (bch_crc_bytes[new.csum_type]        <= 4 &&
-           new.uncompressed_size               <= CRC32_SIZE_MAX &&
-           new.nonce                           <= CRC32_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc32;
-       else if (bch_crc_bytes[new.csum_type]   <= 10 &&
-                  new.uncompressed_size        <= CRC64_SIZE_MAX &&
-                  new.nonce                    <= CRC64_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc64;
-       else if (bch_crc_bytes[new.csum_type]   <= 16 &&
-                  new.uncompressed_size        <= CRC128_SIZE_MAX &&
-                  new.nonce                    <= CRC128_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc128;
-       else
-               BUG();
-
-       bch2_extent_crc_pack(crc, new, type);
-
-       k->k.u64s += extent_entry_u64s(ptrs.end);
-
-       EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
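-
-/*
- * The selection above picks the smallest entry that fits: crc32 packs
- * into one u64 and holds checksums up to 4 bytes, crc64 uses two u64s
- * for up to 10 bytes of checksum with bigger size/nonce fields, and
- * crc128 uses three u64s for a full 16 byte csum; anything larger is a
- * programming error.
- */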
-
-/* Generic code for keys with pointers: */
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
-       return bch2_bkey_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
-       return k.k->type == KEY_TYPE_reservation
-               ? bkey_s_c_to_reservation(k).v->nr_replicas
-               : bch2_bkey_dirty_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
-{
-       unsigned ret = 0;
-
-       if (k.k->type == KEY_TYPE_reservation) {
-               ret = bkey_s_c_to_reservation(k).v->nr_replicas;
-       } else {
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       ret += !p.ptr.cached && !crc_is_compressed(p.crc);
-       }
-
-       return ret;
-}
-
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned ret = 0;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               if (!p.ptr.cached && crc_is_compressed(p.crc))
-                       ret += p.crc.compressed_size;
-
-       return ret;
-}
-
-bool bch2_bkey_is_incompressible(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-
-       bkey_for_each_crc(k.k, ptrs, crc, entry)
-               if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
-                       return true;
-       return false;
-}
-
-unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p = { 0 };
-       unsigned replicas = 0;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (p.ptr.cached)
-                       continue;
-
-               if (p.has_ec)
-                       replicas += p.ec.redundancy;
-
-               replicas++;
-       }
-
-       return replicas;
-}
-
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
-{
-       if (p->ptr.cached)
-               return 0;
-
-       return p->has_ec
-               ? p->ec.redundancy + 1
-               : ca->mi.durability;
-}
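-
-/*
- * e.g. a plain pointer on a device with durability 2 contributes 2, an
- * erasure coded pointer in a stripe with redundancy 1 contributes 2
- * regardless of the device's durability, and a cached pointer
- * contributes nothing.
- */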
-
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
-       struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
-       return ca ? __extent_ptr_durability(ca, p) : 0;
-}
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
-       struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
-       if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
-               return 0;
-
-       return __extent_ptr_durability(ca, p);
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned durability = 0;
-
-       guard(rcu)();
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               durability += bch2_extent_ptr_durability(c, &p);
-       return durability;
-}
-
-static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned durability = 0;
-
-       guard(rcu)();
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
-                       durability += bch2_extent_ptr_durability(c, &p);
-       return durability;
-}
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
-{
-       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-       union bch_extent_entry *next = extent_entry_next(entry);
-
-       memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
-       k->k.u64s -= extent_entry_u64s(entry);
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
-                                   struct extent_ptr_decoded *p)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-       struct bch_extent_crc_unpacked crc =
-               bch2_extent_crc_unpack(&k->k, NULL);
-       union bch_extent_entry *pos;
-
-       if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
-               pos = ptrs.start;
-               goto found;
-       }
-
-       bkey_for_each_crc(&k->k, ptrs, crc, pos)
-               if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
-                       pos = extent_entry_next(pos);
-                       goto found;
-               }
-
-       bch2_extent_crc_append(k, p->crc);
-       pos = bkey_val_end(bkey_i_to_s(k));
-found:
-       p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-       __extent_entry_insert(k, pos, to_entry(&p->ptr));
-
-       if (p->has_ec) {
-               p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
-               __extent_entry_insert(k, pos, to_entry(&p->ec));
-       }
-}
-
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
-                                         union bch_extent_entry *entry)
-{
-       union bch_extent_entry *i = ptrs.start;
-
-       if (i == entry)
-               return NULL;
-
-       while (extent_entry_next(i) != entry)
-               i = extent_entry_next(i);
-       return i;
-}
-
-/*
- * Drop the pointer @ptr from @k, along with any crc or stripe entries
- * that would no longer apply to a pointer afterwards:
- */
-void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *entry = to_entry(ptr), *next;
-       bool drop_crc = true;
-
-       if (k.k->type == KEY_TYPE_stripe) {
-               ptr->dev = BCH_SB_MEMBER_INVALID;
-               return;
-       }
-
-       EBUG_ON(ptr < &ptrs.start->ptr ||
-               ptr >= &ptrs.end->ptr);
-       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-
-       for (next = extent_entry_next(entry);
-            next != ptrs.end;
-            next = extent_entry_next(next)) {
-               if (extent_entry_is_crc(next)) {
-                       break;
-               } else if (extent_entry_is_ptr(next)) {
-                       drop_crc = false;
-                       break;
-               }
-       }
-
-       extent_entry_drop(k, entry);
-
-       while ((entry = extent_entry_prev(ptrs, entry))) {
-               if (extent_entry_is_ptr(entry))
-                       break;
-
-               if ((extent_entry_is_crc(entry) && drop_crc) ||
-                   extent_entry_is_stripe_ptr(entry))
-                       extent_entry_drop(k, entry);
-       }
-}
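-
-/*
- * The forward scan above decides whether the crc entry preceding the
- * dropped pointer is still needed: if the next entry is another
- * pointer, it still relies on that crc, so drop_crc is cleared. The
- * backwards walk then removes the dropped pointer's preceding
- * stripe_ptr and, when unused, its crc.
- */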
-
-void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
-       if (k.k->type != KEY_TYPE_stripe) {
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (p.ptr.dev == ptr->dev && p.has_ec) {
-                               ptr->dev = BCH_SB_MEMBER_INVALID;
-                               return;
-                       }
-       }
-
-       bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
-
-       bch2_bkey_drop_ptr_noerror(k, ptr);
-
-       /*
-        * If we deleted all the dirty pointers and there's still cached
-        * pointers, we could set the cached pointers to dirty if they're not
-        * stale - but to do that correctly we'd need to grab an open_bucket
-        * reference so that we don't race with bucket reuse:
-        */
-       if (have_dirty &&
-           !bch2_bkey_dirty_devs(k.s_c).nr) {
-               k.k->type = KEY_TYPE_error;
-               set_bkey_val_u64s(k.k, 0);
-       } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
-               k.k->type = KEY_TYPE_deleted;
-               set_bkey_val_u64s(k.k, 0);
-       }
-}
-
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
-{
-       bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
-}
-
-void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
-{
-       bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
-}
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->dev == dev)
-                       return ptr;
-
-       return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_dev *ca;
-
-       guard(rcu)();
-       bkey_for_each_ptr(ptrs, ptr)
-               if (bch2_dev_in_target(c, ptr->dev, target) &&
-                   (ca = bch2_dev_rcu(c, ptr->dev)) &&
-                   (!ptr->cached ||
-                    !dev_ptr_stale_rcu(ca, ptr)))
-                       return true;
-
-       return false;
-}
-
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
-                          struct bch_extent_ptr m, u64 offset)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               if (p.ptr.dev   == m.dev &&
-                   p.ptr.gen   == m.gen &&
-                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
-                   (s64) m.offset  - offset)
-                       return true;
-
-       return false;
-}
-
-/*
- * Returns true if two extents refer to the same data:
- */
-bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
-{
-       if (k1.k->type != k2.k->type)
-               return false;
-
-       if (bkey_extent_is_direct_data(k1.k)) {
-               struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
-               struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
-               const union bch_extent_entry *entry1, *entry2;
-               struct extent_ptr_decoded p1, p2;
-
-               if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
-                       return false;
-
-               bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
-                       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-                               if (p1.ptr.dev          == p2.ptr.dev &&
-                                   p1.ptr.gen          == p2.ptr.gen &&
-
-                                   /*
-                                    * This checks that the two pointers point
-                                    * to the same region on disk - adjusting
-                                    * for the difference in where the extents
-                                    * start, since one may have been trimmed:
-                                    */
-                                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-                                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
-
-                                   /*
-                                    * This additionally checks that the
-                                    * extents overlap on disk, since the
-                                    * previous check may trigger spuriously
-                                    * when one extent is immediately partially
-                                    * overwritten with another extent (so that
-                                    * on disk they are adjacent) and
-                                    * compression is in use:
-                                    */
-                                   ((p1.ptr.offset >= p2.ptr.offset &&
-                                     p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
-                                    (p2.ptr.offset >= p1.ptr.offset &&
-                                     p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
-                                       return true;
-
-               return false;
-       } else {
-               /* KEY_TYPE_deleted, etc. */
-               return true;
-       }
-}
-
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
-{
-       struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
-       union bch_extent_entry *entry2;
-       struct extent_ptr_decoded p2;
-
-       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-               if (p1.ptr.dev          == p2.ptr.dev &&
-                   p1.ptr.gen          == p2.ptr.gen &&
-                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-                       return &entry2->ptr;
-
-       return NULL;
-}
-
-static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
-                           struct bch_extent_ptr *ptr)
-{
-       unsigned target = opts->promote_target ?: opts->foreground_target;
-
-       if (target && !bch2_dev_in_target(c, ptr->dev, target))
-               return false;
-
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-
-       return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
-}
-
-void bch2_extent_ptr_set_cached(struct bch_fs *c,
-                               struct bch_io_opts *opts,
-                               struct bkey_s k,
-                               struct bch_extent_ptr *ptr)
-{
-       struct bkey_ptrs ptrs;
-       union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       bool have_cached_ptr;
-       unsigned drop_dev = ptr->dev;
-
-       guard(rcu)();
-restart_drop_ptrs:
-       ptrs = bch2_bkey_ptrs(k);
-       have_cached_ptr = false;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               /*
-                * Check if it's erasure coded - stripes can't contain cached
-                * data. Possibly something we can fix in the future?
-                */
-               if (&entry->ptr == ptr && p.has_ec)
-                       goto drop;
-
-               if (p.ptr.cached) {
-                       if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
-                               bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
-                               ptr = NULL;
-                               goto restart_drop_ptrs;
-                       }
-
-                       have_cached_ptr = true;
-               }
-       }
-
-       if (!ptr)
-               bkey_for_each_ptr(ptrs, ptr2)
-                       if (ptr2->dev == drop_dev)
-                               ptr = ptr2;
-
-       if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
-               goto drop;
-
-       ptr->cached = true;
-       return;
-drop:
-       bch2_bkey_drop_ptr_noerror(k, ptr);
-}
-
-/*
- * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
-       struct bch_dev *ca;
-
-       guard(rcu)();
-       bch2_bkey_drop_ptrs(k, ptr,
-               ptr->cached &&
-               (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
-                dev_ptr_stale_rcu(ca, ptr) > 0));
-
-       return bkey_deleted(k.k);
-}
-
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
-                                  struct bch_io_opts *opts,
-                                  struct bkey_s k)
-{
-       struct bkey_ptrs ptrs;
-       bool have_cached_ptr;
-
-       guard(rcu)();
-restart_drop_ptrs:
-       ptrs = bch2_bkey_ptrs(k);
-       have_cached_ptr = false;
-
-       bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->cached) {
-                       if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
-                               bch2_bkey_drop_ptr(k, ptr);
-                               goto restart_drop_ptrs;
-                       }
-                       have_cached_ptr = true;
-               }
-
-       return bkey_deleted(k.k);
-}
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
-{
-       out->atomic++;
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-       if (!ca) {
-               prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
-                          (u64) ptr->offset, ptr->gen,
-                          ptr->cached ? " cached" : "");
-       } else {
-               u32 offset;
-               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
-               prt_printf(out, "ptr: %u:%llu:%u gen %u",
-                          ptr->dev, b, offset, ptr->gen);
-               if (ca->mi.durability != 1)
-                       prt_printf(out, " d=%u", ca->mi.durability);
-               if (ptr->cached)
-                       prt_str(out, " cached");
-               if (ptr->unwritten)
-                       prt_str(out, " unwritten");
-               int stale = dev_ptr_stale_rcu(ca, ptr);
-               if (stale > 0)
-                       prt_printf(out, " stale");
-               else if (stale)
-                       prt_printf(out, " invalid");
-       }
-       --out->atomic;
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
-{
-       prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
-                  crc->compressed_size,
-                  crc->uncompressed_size,
-                  crc->offset, crc->nonce);
-       bch2_prt_csum_type(out, crc->csum_type);
-       prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
-       prt_str(out, " compress ");
-       bch2_prt_compression_type(out, crc->compression_type);
-}
-
-static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
-                                         const struct bch_extent_rebalance *r)
-{
-       prt_str(out, "rebalance:");
-
-       prt_printf(out, " replicas=%u", r->data_replicas);
-       if (r->data_replicas_from_inode)
-               prt_str(out, " (inode)");
-
-       prt_str(out, " checksum=");
-       bch2_prt_csum_opt(out, r->data_checksum);
-       if (r->data_checksum_from_inode)
-               prt_str(out, " (inode)");
-
-       if (r->background_compression || r->background_compression_from_inode) {
-               prt_str(out, " background_compression=");
-               bch2_compression_opt_to_text(out, r->background_compression);
-
-               if (r->background_compression_from_inode)
-                       prt_str(out, " (inode)");
-       }
-
-       if (r->background_target || r->background_target_from_inode) {
-               prt_str(out, " background_target=");
-               if (c)
-                       bch2_target_to_text(out, c, r->background_target);
-               else
-                       prt_printf(out, "%u", r->background_target);
-
-               if (r->background_target_from_inode)
-                       prt_str(out, " (inode)");
-       }
-
-       if (r->promote_target || r->promote_target_from_inode) {
-               prt_str(out, " promote_target=");
-               if (c)
-                       bch2_target_to_text(out, c, r->promote_target);
-               else
-                       prt_printf(out, "%u", r->promote_target);
-
-               if (r->promote_target_from_inode)
-                       prt_str(out, " (inode)");
-       }
-
-       if (r->erasure_code || r->erasure_code_from_inode) {
-               prt_printf(out, " ec=%u", r->erasure_code);
-               if (r->erasure_code_from_inode)
-                       prt_str(out, " (inode)");
-       }
-}
-
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       bool first = true;
-
-       if (c)
-               prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               if (!first)
-                       prt_printf(out, " ");
-
-               switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
-                       break;
-
-               case BCH_EXTENT_ENTRY_crc32:
-               case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128: {
-                       struct bch_extent_crc_unpacked crc =
-                               bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-                       bch2_extent_crc_unpacked_to_text(out, &crc);
-                       break;
-               }
-               case BCH_EXTENT_ENTRY_stripe_ptr: {
-                       const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
-
-                       prt_printf(out, "ec: idx %llu block %u",
-                              (u64) ec->idx, ec->block);
-                       break;
-               }
-               case BCH_EXTENT_ENTRY_rebalance:
-                       bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
-                       break;
-
-               case BCH_EXTENT_ENTRY_flags:
-                       prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
-                       break;
-
-               default:
-                       prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
-                       return;
-               }
-
-               first = false;
-       }
-}
-
-static int extent_ptr_validate(struct bch_fs *c,
-                              struct bkey_s_c k,
-                              struct bkey_validate_context from,
-                              const struct bch_extent_ptr *ptr,
-                              unsigned size_ondisk,
-                              bool metadata)
-{
-       int ret = 0;
-
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       bkey_for_each_ptr(ptrs, ptr2)
-               bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
-                                c, ptr_to_duplicate_device,
-                                "multiple pointers to same device (%u)", ptr->dev);
-
-       /* bad pointers are repaired by check_fix_ptrs(): */
-       rcu_read_lock();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-       if (!ca) {
-               rcu_read_unlock();
-               return 0;
-       }
-       u32 bucket_offset;
-       u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
-       unsigned first_bucket   = ca->mi.first_bucket;
-       u64 nbuckets            = ca->mi.nbuckets;
-       unsigned bucket_size    = ca->mi.bucket_size;
-       rcu_read_unlock();
-
-       bkey_fsck_err_on(bucket >= nbuckets,
-                        c, ptr_after_last_bucket,
-                        "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
-       bkey_fsck_err_on(bucket < first_bucket,
-                        c, ptr_before_first_bucket,
-                        "pointer before first bucket (%llu < %u)", bucket, first_bucket);
-       bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
-                        c, ptr_spans_multiple_buckets,
-                        "pointer spans multiple buckets (%u + %u > %u)",
-                      bucket_offset, size_ondisk, bucket_size);
-fsck_err:
-       return ret;
-}
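-
-/*
- * Illustrative bucket math (made-up geometry): with bucket_size 128 and
- * nbuckets 1000, a pointer at sector offset 1000 lands in bucket 7 at
- * offset 104 and passes; sector offset 128000 would land in bucket 1000
- * and trip the "past last bucket" check.
- */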
-
-int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       unsigned size_ondisk = k.k->size;
-       unsigned nonce = UINT_MAX;
-       unsigned nr_ptrs = 0;
-       bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
-       int ret = 0;
-
-       if (bkey_is_btree_ptr(k.k))
-               size_ondisk = btree_sectors(c);
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
-                                c, extent_ptrs_invalid_entry,
-                                "invalid extent entry type (got %u, max %u)",
-                                __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-
-               bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
-                                !extent_entry_is_ptr(entry),
-                                c, btree_ptr_has_non_ptr,
-                                "has non ptr field");
-
-               switch (extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
-                       if (ret)
-                               return ret;
-
-                       bkey_fsck_err_on(entry->ptr.cached && have_ec,
-                                        c, ptr_cached_and_erasure_coded,
-                                        "cached, erasure coded ptr");
-
-                       if (!entry->ptr.unwritten)
-                               have_written = true;
-                       else
-                               have_unwritten = true;
-
-                       have_ec = false;
-                       crc_since_last_ptr = false;
-                       nr_ptrs++;
-                       break;
-               case BCH_EXTENT_ENTRY_crc32:
-               case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-                       bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
-                                        c, ptr_crc_csum_type_unknown,
-                                        "invalid checksum type");
-                       bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
-                                        c, ptr_crc_compression_type_unknown,
-                                        "invalid compression type");
-
-                       bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
-                                        c, ptr_crc_uncompressed_size_too_small,
-                                        "checksum offset + key size > uncompressed size");
-                       bkey_fsck_err_on(crc_is_encoded(crc) &&
-                                        (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
-                                        (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
-                                        c, ptr_crc_uncompressed_size_too_big,
-                                        "too large encoded extent");
-                       bkey_fsck_err_on(!crc_is_compressed(crc) &&
-                                        crc.compressed_size != crc.uncompressed_size,
-                                        c, ptr_crc_uncompressed_size_mismatch,
-                                        "not compressed but compressed != uncompressed size");
-
-                       if (bch2_csum_type_is_encryption(crc.csum_type)) {
-                               if (nonce == UINT_MAX)
-                                       nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce)
-                                       bkey_fsck_err(c, ptr_crc_nonce_mismatch,
-                                                     "incorrect nonce");
-                       }
-
-                       bkey_fsck_err_on(crc_since_last_ptr,
-                                        c, ptr_crc_redundant,
-                                        "redundant crc entry");
-                       crc_since_last_ptr = true;
-
-                       size_ondisk = crc.compressed_size;
-                       break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       bkey_fsck_err_on(have_ec,
-                                        c, ptr_stripe_redundant,
-                                        "redundant stripe entry");
-                       have_ec = true;
-                       break;
-               case BCH_EXTENT_ENTRY_rebalance: {
-                       /*
-                        * this shouldn't be a fsck error, for forward
-                        * compatibility; the rebalance code should just refetch
-                        * the compression opt if it's unknown
-                        */
-#if 0
-                       const struct bch_extent_rebalance *r = &entry->rebalance;
-
-                       if (!bch2_compression_opt_valid(r->compression)) {
-                               struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
-                               prt_printf(err, "invalid compression opt %u:%u",
-                                          opt.type, opt.level);
-                               return bch_err_throw(c, invalid_bkey);
-                       }
-#endif
-                       break;
-               }
-               case BCH_EXTENT_ENTRY_flags:
-                       bkey_fsck_err_on(entry != ptrs.start,
-                                        c, extent_flags_not_at_start,
-                                        "extent flags entry not at start");
-                       break;
-               }
-       }
-
-       bkey_fsck_err_on(!nr_ptrs,
-                        c, extent_ptrs_no_ptrs,
-                        "no ptrs");
-       bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
-                        c, extent_ptrs_too_many_ptrs,
-                        "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
-       bkey_fsck_err_on(have_written && have_unwritten,
-                        c, extent_ptrs_written_and_unwritten,
-                        "extent with unwritten and written ptrs");
-       bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
-                        c, extent_ptrs_unwritten,
-                        "has unwritten ptrs");
-       bkey_fsck_err_on(crc_since_last_ptr,
-                        c, extent_ptrs_redundant_crc,
-                        "redundant crc entry");
-       bkey_fsck_err_on(have_ec,
-                        c, extent_ptrs_redundant_stripe,
-                        "redundant stripe entry");
-fsck_err:
-       return ret;
-}
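-
-/*
- * Taken together, these checks pin down the entry layout: an optional
- * flags entry first, then for each replica an optional crc entry and
- * optional stripe_ptr followed by the pointer they apply to, with no
- * trailing crc/stripe entries, at least one pointer, and written and
- * unwritten pointers never mixed.
- */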
-
-void bch2_ptr_swab(struct bkey_s k)
-{
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *entry;
-       u64 *d;
-
-       for (d =  (u64 *) ptrs.start;
-            d != (u64 *) ptrs.end;
-            d++)
-               *d = swab64(*d);
-
-       for (entry = ptrs.start;
-            entry < ptrs.end;
-            entry = extent_entry_next(entry)) {
-               switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       break;
-               case BCH_EXTENT_ENTRY_crc32:
-                       entry->crc32.csum = swab32(entry->crc32.csum);
-                       break;
-               case BCH_EXTENT_ENTRY_crc64:
-                       entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
-                       entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
-                       break;
-               case BCH_EXTENT_ENTRY_crc128:
-                       entry->crc128.csum.hi = (__force __le64)
-                               swab64((__force u64) entry->crc128.csum.hi);
-                       entry->crc128.csum.lo = (__force __le64)
-                               swab64((__force u64) entry->crc128.csum.lo);
-                       break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       break;
-               case BCH_EXTENT_ENTRY_rebalance:
-                       break;
-               default:
-                       /* Bad entry type: will be caught by validate() */
-                       return;
-               }
-       }
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
-{
-       int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
-       if (ret)
-               return ret;
-
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
-       if (ptrs.start != ptrs.end &&
-           extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
-               ptrs.start->flags.flags = flags;
-       } else {
-               struct bch_extent_flags f = {
-                       .type   = BIT(BCH_EXTENT_ENTRY_flags),
-                       .flags  = flags,
-               };
-               __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
-       }
-
-       return 0;
-}
-
-/* Generic extent code: */
-
-int bch2_cut_front_s(struct bpos where, struct bkey_s k)
-{
-       unsigned new_val_u64s = bkey_val_u64s(k.k);
-       int val_u64s_delta;
-       u64 sub;
-
-       if (bkey_le(where, bkey_start_pos(k.k)))
-               return 0;
-
-       EBUG_ON(bkey_gt(where, k.k->p));
-
-       sub = where.offset - bkey_start_offset(k.k);
-
-       k.k->size -= sub;
-
-       if (!k.k->size) {
-               k.k->type = KEY_TYPE_deleted;
-               new_val_u64s = 0;
-       }
-
-       switch (k.k->type) {
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v: {
-               struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-               union bch_extent_entry *entry;
-               bool seen_crc = false;
-
-               bkey_extent_entry_for_each(ptrs, entry) {
-                       switch (extent_entry_type(entry)) {
-                       case BCH_EXTENT_ENTRY_ptr:
-                               if (!seen_crc)
-                                       entry->ptr.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc32:
-                               entry->crc32.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc64:
-                               entry->crc64.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_stripe_ptr:
-                       case BCH_EXTENT_ENTRY_rebalance:
-                       case BCH_EXTENT_ENTRY_flags:
-                               break;
-                       }
-
-                       if (extent_entry_is_crc(entry))
-                               seen_crc = true;
-               }
-
-               break;
-       }
-       case KEY_TYPE_reflink_p: {
-               struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
-               SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
-               break;
-       }
-       case KEY_TYPE_inline_data:
-       case KEY_TYPE_indirect_inline_data: {
-               void *p = bkey_inline_data_p(k);
-               unsigned bytes = bkey_inline_data_bytes(k.k);
-
-               sub = min_t(u64, sub << 9, bytes);
-
-               memmove(p, p + sub, bytes - sub);
-
-               new_val_u64s -= sub >> 3;
-               break;
-       }
-       }
-
-       val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
-       BUG_ON(val_u64s_delta < 0);
-
-       set_bkey_val_u64s(k.k, new_val_u64s);
-       memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
-       return -val_u64s_delta;
-}
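-
-/*
- * Example: cutting 4 sectors off the front of a 16 sector extent leaves
- * size 12; bare pointers advance ptr.offset by 4, while checksummed
- * data instead bumps the crc offset so existing checksums still verify.
- * The return value is minus the number of value u64s freed, nonzero
- * here only for inline data or a key that became deleted.
- */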
-
-int bch2_cut_back_s(struct bpos where, struct bkey_s k)
-{
-       unsigned new_val_u64s = bkey_val_u64s(k.k);
-       int val_u64s_delta;
-       u64 len = 0;
-
-       if (bkey_ge(where, k.k->p))
-               return 0;
-
-       EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
-
-       len = where.offset - bkey_start_offset(k.k);
-
-       k.k->p.offset = where.offset;
-       k.k->size = len;
-
-       if (!len) {
-               k.k->type = KEY_TYPE_deleted;
-               new_val_u64s = 0;
-       }
-
-       switch (k.k->type) {
-       case KEY_TYPE_inline_data:
-       case KEY_TYPE_indirect_inline_data:
-               new_val_u64s = (bkey_inline_data_offset(k.k) +
-                               min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
-               break;
-       }
-
-       val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
-       BUG_ON(val_u64s_delta < 0);
-
-       set_bkey_val_u64s(k.k, new_val_u64s);
-       memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
-       return -val_u64s_delta;
-}
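-
-/*
- * Cutting the back is simpler: the start of the data doesn't move, so
- * pointers and crc entries are left alone; only inline data keys have
- * their value trimmed.
- */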
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
deleted file mode 100644 (file)
index b8590e5..0000000
+++ /dev/null
@@ -1,768 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_H
-#define _BCACHEFS_EXTENTS_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "extents_types.h"
-
-struct bch_fs;
-struct btree_trans;
-
-/* extent entries: */
-
-#define extent_entry_last(_e)                                          \
-       ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-
-#define entry_to_ptr(_entry)                                           \
-({                                                                     \
-       EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));              \
-                                                                       \
-       __builtin_choose_expr(                                          \
-               type_is_exact(_entry, const union bch_extent_entry *),  \
-               (const struct bch_extent_ptr *) (_entry),               \
-               (struct bch_extent_ptr *) (_entry));                    \
-})
-
-/* downcast, preserves const */
-#define to_entry(_entry)                                               \
-({                                                                     \
-       BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&        \
-                    !type_is(_entry, struct bch_extent_ptr *) &&       \
-                    !type_is(_entry, struct bch_extent_stripe_ptr *)); \
-                                                                       \
-       __builtin_choose_expr(                                          \
-               (type_is_exact(_entry, const union bch_extent_crc *) || \
-                type_is_exact(_entry, const struct bch_extent_ptr *) ||\
-                type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
-               (const union bch_extent_entry *) (_entry),              \
-               (union bch_extent_entry *) (_entry));                   \
-})
-
-#define extent_entry_next(_entry)                                      \
-       ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_next_safe(_entry, _end)                           \
-       (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX)     \
-        ? extent_entry_next(_entry)                                    \
-        : _end)
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
-       return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
-       int ret = __ffs(e->type);
-
-       EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
-       return ret;
-}
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
-       switch (extent_entry_type(entry)) {
-#define x(f, n)                                                \
-       case BCH_EXTENT_ENTRY_##f:                      \
-               return sizeof(struct bch_extent_##f);
-       BCH_EXTENT_ENTRY_TYPES()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
-       return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
-                                        union bch_extent_entry *dst,
-                                        union bch_extent_entry *new)
-{
-       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
-       memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
-                             dst, (u64 *) end - (u64 *) dst);
-       k->k.u64s += extent_entry_u64s(new);
-       memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
-static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
-       union bch_extent_entry *next = extent_entry_next(entry);
-
-       /* stripes have ptrs, but their layout doesn't work with this code */
-       BUG_ON(k.k->type == KEY_TYPE_stripe);
-
-       memmove_u64s_down(entry, next,
-                         (u64 *) bkey_val_end(k) - (u64 *) next);
-       k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
-       return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
-{
-       return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
-       switch (__extent_entry_type(e)) {
-       case BCH_EXTENT_ENTRY_crc32:
-       case BCH_EXTENT_ENTRY_crc64:
-       case BCH_EXTENT_ENTRY_crc128:
-               return true;
-       default:
-               return false;
-       }
-}
-
-union bch_extent_crc {
-       u8                              type;
-       struct bch_extent_crc32         crc32;
-       struct bch_extent_crc64         crc64;
-       struct bch_extent_crc128        crc128;
-};
-
-#define __entry_to_crc(_entry)                                         \
-       __builtin_choose_expr(                                          \
-               type_is_exact(_entry, const union bch_extent_entry *),  \
-               (const union bch_extent_crc *) (_entry),                \
-               (union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry)                                           \
-({                                                                     \
-       EBUG_ON((_entry) && !extent_entry_is_crc(_entry));              \
-                                                                       \
-       __entry_to_crc(_entry);                                         \
-})
-
-static inline struct bch_extent_crc_unpacked
-bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
-{
-#define common_fields(_crc)                                            \
-               .csum_type              = _crc.csum_type,               \
-               .compression_type       = _crc.compression_type,        \
-               .compressed_size        = _crc._compressed_size + 1,    \
-               .uncompressed_size      = _crc._uncompressed_size + 1,  \
-               .offset                 = _crc.offset,                  \
-               .live_size              = k->size
-
-       if (!crc)
-               return (struct bch_extent_crc_unpacked) {
-                       .compressed_size        = k->size,
-                       .uncompressed_size      = k->size,
-                       .live_size              = k->size,
-               };
-
-       switch (extent_entry_type(to_entry(crc))) {
-       case BCH_EXTENT_ENTRY_crc32: {
-               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-                       common_fields(crc->crc32),
-               };
-
-               *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
-               return ret;
-       }
-       case BCH_EXTENT_ENTRY_crc64: {
-               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-                       common_fields(crc->crc64),
-                       .nonce                  = crc->crc64.nonce,
-                       .csum.lo                = (__force __le64) crc->crc64.csum_lo,
-               };
-
-               *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
-
-               return ret;
-       }
-       case BCH_EXTENT_ENTRY_crc128: {
-               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
-                       common_fields(crc->crc128),
-                       .nonce                  = crc->crc128.nonce,
-                       .csum                   = crc->crc128.csum,
-               };
-
-               return ret;
-       }
-       default:
-               BUG();
-       }
-#undef common_fields
-}
-
-static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
-{
-       return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
-               crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
-}
-
-static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
-{
-       return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
-
-/* bkey_ptrs: generically over any key type that has ptrs */
-
-struct bkey_ptrs_c {
-       const union bch_extent_entry    *start;
-       const union bch_extent_entry    *end;
-};
-
-struct bkey_ptrs {
-       union bch_extent_entry  *start;
-       union bch_extent_entry  *end;
-};
-
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr: {
-               struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-
-               return (struct bkey_ptrs_c) {
-                       to_entry(&e.v->start[0]),
-                       to_entry(extent_entry_last(e))
-               };
-       }
-       case KEY_TYPE_extent: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
-               return (struct bkey_ptrs_c) {
-                       e.v->start,
-                       extent_entry_last(e)
-               };
-       }
-       case KEY_TYPE_stripe: {
-               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-               return (struct bkey_ptrs_c) {
-                       to_entry(&s.v->ptrs[0]),
-                       to_entry(&s.v->ptrs[s.v->nr_blocks]),
-               };
-       }
-       case KEY_TYPE_reflink_v: {
-               struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
-               return (struct bkey_ptrs_c) {
-                       r.v->start,
-                       bkey_val_end(r),
-               };
-       }
-       case KEY_TYPE_btree_ptr_v2: {
-               struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-
-               return (struct bkey_ptrs_c) {
-                       to_entry(&e.v->start[0]),
-                       to_entry(extent_entry_last(e))
-               };
-       }
-       default:
-               return (struct bkey_ptrs_c) { NULL, NULL };
-       }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
-       return (struct bkey_ptrs) {
-               (void *) p.start,
-               (void *) p.end
-       };
-}
-
-#define __bkey_extent_entry_for_each_from(_start, _end, _entry)                \
-       for ((_entry) = (_start);                                       \
-            (_entry) < (_end);                                         \
-            (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define __bkey_ptr_next(_ptr, _end)                                    \
-({                                                                     \
-       typeof(_end) _entry;                                            \
-                                                                       \
-       __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
-               if (extent_entry_is_ptr(_entry))                        \
-                       break;                                          \
-                                                                       \
-       _entry < (_end) ? entry_to_ptr(_entry) : NULL;                  \
-})
-
-#define bkey_extent_entry_for_each_from(_p, _entry, _start)            \
-       __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
-
-#define bkey_extent_entry_for_each(_p, _entry)                         \
-       bkey_extent_entry_for_each_from(_p, _entry, _p.start)
-
-#define __bkey_for_each_ptr(_start, _end, _ptr)                                \
-       for (typeof(_start) (_ptr) = (_start);                          \
-            ((_ptr) = __bkey_ptr_next(_ptr, _end));                    \
-            (_ptr)++)
-
-#define bkey_ptr_next(_p, _ptr)                                                \
-       __bkey_ptr_next(_ptr, (_p).end)
-
-#define bkey_for_each_ptr(_p, _ptr)                                    \
-       __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
-
-#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry)                 \
-({                                                                     \
-       __label__ out;                                                  \
-                                                                       \
-       (_ptr).has_ec                   = false;                        \
-       (_ptr).do_ec_reconstruct        = false;                        \
-       (_ptr).crc_retry_nr             = 0;                            \
-                                                                       \
-       __bkey_extent_entry_for_each_from(_entry, _end, _entry)         \
-               switch (__extent_entry_type(_entry)) {                  \
-               case BCH_EXTENT_ENTRY_ptr:                              \
-                       (_ptr).ptr              = _entry->ptr;          \
-                       goto out;                                       \
-               case BCH_EXTENT_ENTRY_crc32:                            \
-               case BCH_EXTENT_ENTRY_crc64:                            \
-               case BCH_EXTENT_ENTRY_crc128:                           \
-                       (_ptr).crc = bch2_extent_crc_unpack(_k,         \
-                                       entry_to_crc(_entry));          \
-                       break;                                          \
-               case BCH_EXTENT_ENTRY_stripe_ptr:                       \
-                       (_ptr).ec = _entry->stripe_ptr;                 \
-                       (_ptr).has_ec   = true;                         \
-                       break;                                          \
-               default:                                                \
-                       /* nothing */                                   \
-                       break;                                          \
-               }                                                       \
-out:                                                                   \
-       _entry < (_end);                                                \
-})
-
-#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry)     \
-       for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),             \
-            (_entry) = _start;                                         \
-            __bkey_ptr_next_decode(_k, _end, _ptr, _entry);            \
-            (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)                 \
-       __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,            \
-                                  _ptr, _entry)
-
-#define bkey_crc_next(_k, _end, _crc, _iter)                   \
-({                                                                     \
-       __bkey_extent_entry_for_each_from(_iter, _end, _iter)           \
-               if (extent_entry_is_crc(_iter)) {                       \
-                       (_crc) = bch2_extent_crc_unpack(_k,             \
-                                               entry_to_crc(_iter));   \
-                       break;                                          \
-               }                                                       \
-                                                                       \
-       (_iter) < (_end);                                               \
-})
-
-#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)             \
-       for ((_crc) = bch2_extent_crc_unpack(_k, NULL),                 \
-            (_iter) = (_start);                                        \
-            bkey_crc_next(_k, _end, _crc, _iter);              \
-            (_iter) = extent_entry_next(_iter))
-
-#define bkey_for_each_crc(_k, _p, _crc, _iter)                         \
-       __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
-
-/* Iterate over pointers in KEY_TYPE_extent: */
-
-#define extent_ptr_next(_e, _ptr)                                      \
-       __bkey_ptr_next(_ptr, extent_entry_last(_e))
-
-#define extent_for_each_ptr(_e, _ptr)                                  \
-       __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-
-#define extent_for_each_ptr_decode(_e, _ptr, _entry)                   \
-       __bkey_for_each_ptr_decode((_e).k, (_e).v->start,               \
-                                  extent_entry_last(_e), _ptr, _entry)
-
-/* utility code common to all keys with pointers: */
-
-void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
-                             struct bch_io_failures *);
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
-                                                unsigned);
-void bch2_mark_io_failure(struct bch_io_failures *,
-                         struct extent_ptr_decoded *, bool);
-void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
-int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
-                              struct bch_io_failures *,
-                              struct extent_ptr_decoded *, int);
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
-                           struct bkey_validate_context);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
-                           struct bkey_s_c);
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
-                              struct bkey_validate_context);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
-                             int, struct bkey_s);
-
-#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) {           \
-       .key_validate   = bch2_btree_ptr_validate,              \
-       .val_to_text    = bch2_btree_ptr_to_text,               \
-       .swab           = bch2_ptr_swab,                        \
-       .trigger        = bch2_trigger_extent,                  \
-})
-
-#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) {                \
-       .key_validate   = bch2_btree_ptr_v2_validate,           \
-       .val_to_text    = bch2_btree_ptr_v2_to_text,            \
-       .swab           = bch2_ptr_swab,                        \
-       .compat         = bch2_btree_ptr_v2_compat,             \
-       .trigger        = bch2_trigger_extent,                  \
-       .min_val_size   = 40,                                   \
-})
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_extent ((struct bkey_ops) {              \
-       .key_validate   = bch2_bkey_ptrs_validate,              \
-       .val_to_text    = bch2_bkey_ptrs_to_text,               \
-       .swab           = bch2_ptr_swab,                        \
-       .key_normalize  = bch2_extent_normalize,                \
-       .key_merge      = bch2_extent_merge,                    \
-       .trigger        = bch2_trigger_extent,                  \
-})
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
-                             struct bkey_validate_context);
-void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_reservation ((struct bkey_ops) {         \
-       .key_validate   = bch2_reservation_validate,            \
-       .val_to_text    = bch2_reservation_to_text,             \
-       .key_merge      = bch2_reservation_merge,               \
-       .trigger        = bch2_trigger_reservation,             \
-       .min_val_size   = 8,                                    \
-})
-
-/* Extent checksum entries: */
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
-                                struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
-void bch2_extent_crc_append(struct bkey_i *,
-                           struct bch_extent_crc_unpacked);
-
-/* Generic code for keys with pointers: */
-
-static inline bool bkey_is_btree_ptr(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool bkey_extent_is_direct_data(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool bkey_extent_is_inline_data(const struct bkey *k)
-{
-       return  k->type == KEY_TYPE_inline_data ||
-               k->type == KEY_TYPE_indirect_inline_data;
-}
-
-static inline unsigned bkey_inline_data_offset(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_inline_data:
-               return sizeof(struct bch_inline_data);
-       case KEY_TYPE_indirect_inline_data:
-               return sizeof(struct bch_indirect_inline_data);
-       default:
-               BUG();
-       }
-}
-
-static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
-{
-       return bkey_val_bytes(k) - bkey_inline_data_offset(k);
-}
-
-#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
-       return  bkey_extent_is_direct_data(k) ||
-               bkey_extent_is_inline_data(k) ||
-               k->type == KEY_TYPE_reflink_p;
-}
-
-/*
- * Should extent be counted under inode->i_sectors?
- */
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reservation:
-       case KEY_TYPE_reflink_p:
-       case KEY_TYPE_reflink_v:
-       case KEY_TYPE_inline_data:
-       case KEY_TYPE_indirect_inline_data:
-       case KEY_TYPE_error:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->unwritten)
-                       return true;
-       return false;
-}
-
-static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
-{
-       return k.k->type == KEY_TYPE_reservation ||
-               bkey_extent_is_unwritten(k);
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(p, ptr)
-               ret.data[ret.nr++] = ptr->dev;
-
-       return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(p, ptr)
-               if (!ptr->cached)
-                       ret.data[ret.nr++] = ptr->dev;
-
-       return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
-       bkey_for_each_ptr(p, ptr)
-               if (ptr->cached)
-                       ret.data[ret.nr++] = ptr->dev;
-
-       return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
-bool bch2_bkey_is_incompressible(struct bkey_s_c);
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-
-unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
-
-static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
-{
-       return (void *) bch2_bkey_has_device_c(k.s_c, dev);
-}
-
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-
-static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
-{
-       struct bch_extent_ptr *dest;
-
-       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
-
-       switch (k->k.type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-       case KEY_TYPE_extent:
-               EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
-               ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-               dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
-               *dest = ptr;
-               k->k.u64s++;
-               break;
-       default:
-               BUG();
-       }
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
-                                   struct extent_ptr_decoded *);
-void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
-void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
-#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond)                   \
-do {                                                                   \
-       __label__ _again;                                               \
-       struct bkey_ptrs _ptrs;                                         \
-_again:                                                                        \
-       _ptrs = bch2_bkey_ptrs(_k);                                     \
-                                                                       \
-       bkey_for_each_ptr(_ptrs, _ptr)                                  \
-               if (_cond) {                                            \
-                       bch2_bkey_drop_ptr_noerror(_k, _ptr);           \
-                       goto _again;                                    \
-               }                                                       \
-} while (0)
-
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)                           \
-do {                                                                   \
-       __label__ _again;                                               \
-       struct bkey_ptrs _ptrs;                                         \
-_again:                                                                        \
-       _ptrs = bch2_bkey_ptrs(_k);                                     \
-                                                                       \
-       bkey_for_each_ptr(_ptrs, _ptr)                                  \
-               if (_cond) {                                            \
-                       bch2_bkey_drop_ptr(_k, _ptr);                   \
-                       goto _again;                                    \
-               }                                                       \
-} while (0)
-
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
-                          struct bch_extent_ptr, u64);
-bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-
-void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
-                               struct bkey_s, struct bch_extent_ptr *);
-
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
-                           struct bkey_s_c);
-int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
-                           struct bkey_validate_context);
-
-static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
-                                     struct bch_extent_ptr ptr2)
-{
-       return (ptr1.cached     == ptr2.cached &&
-               ptr1.unwritten  == ptr2.unwritten &&
-               ptr1.offset     == ptr2.offset &&
-               ptr1.dev        == ptr2.dev &&
-               ptr1.gen        == ptr2.gen);
-}
-
-void bch2_ptr_swab(struct bkey_s);
-
-/* Generic extent code: */
-
-enum bch_extent_overlap {
-       BCH_EXTENT_OVERLAP_ALL          = 0,
-       BCH_EXTENT_OVERLAP_BACK         = 1,
-       BCH_EXTENT_OVERLAP_FRONT        = 2,
-       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-                                                         const struct bkey *m)
-{
-       int cmp1 = bkey_lt(k->p, m->p);
-       int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-
-       return (cmp1 << 1) + cmp2;
-}
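
The enum values above are chosen so that the two comparisons compose directly into the overlap class. A minimal userspace check of that encoding, with half-open ranges standing in for bkeys (range and overlap are illustrative stand-ins, not the kernel types):

#include <stdio.h>

enum overlap { ALL = 0, BACK = 1, FRONT = 2, MIDDLE = 3 };

/* start plays the role of bkey_start_pos(), end of k->p. */
struct range { unsigned start, end; };

static enum overlap overlap(struct range k, struct range m)
{
	int cmp1 = k.end < m.end;	/* bkey_lt(k->p, m->p) */
	int cmp2 = k.start > m.start;	/* bkey_gt(start(k), start(m)) */

	return (cmp1 << 1) + cmp2;
}

int main(void)
{
	struct range m = { 10, 20 };

	printf("%d\n", overlap((struct range){  0, 30 }, m)); /* 0: ALL */
	printf("%d\n", overlap((struct range){ 15, 30 }, m)); /* 1: BACK */
	printf("%d\n", overlap((struct range){  0, 15 }, m)); /* 2: FRONT */
	printf("%d\n", overlap((struct range){ 12, 18 }, m)); /* 3: MIDDLE */
	return 0;
}
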
-
-int bch2_cut_front_s(struct bpos, struct bkey_s);
-int bch2_cut_back_s(struct bpos, struct bkey_s);
-
-static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
-       bch2_cut_front_s(where, bkey_i_to_s(k));
-}
-
-static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
-{
-       bch2_cut_back_s(where, bkey_i_to_s(k));
-}
-
-/**
- * bch2_key_resize - adjust the size of @k
- *
- * bkey_start_offset(k) is preserved; this modifies where the extent ends.
- */
-static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
-{
-       k->p.offset -= k->size;
-       k->p.offset += new_size;
-       k->size = new_size;
-}
-
-static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
-{
-       if (ptrs.start != ptrs.end &&
-           extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
-               return ptrs.start->flags.flags;
-       return 0;
-}
-
-static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
-{
-       return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
-
-#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
deleted file mode 100644 (file)
index 74c0252..0000000
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_FORMAT_H
-#define _BCACHEFS_EXTENTS_FORMAT_H
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_ptr      - 0b1
- * bch_extent_crc32    - 0b10
- * bch_extent_crc64    - 0b100
- * bch_extent_crc128   - 0b1000
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits
- * (and bch_extent_crc128 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
-#define BCH_EXTENT_ENTRY_TYPES()               \
-       x(ptr,                  0)              \
-       x(crc32,                1)              \
-       x(crc64,                2)              \
-       x(crc128,               3)              \
-       x(stripe_ptr,           4)              \
-       x(rebalance,            5)              \
-       x(flags,                6)
-#define BCH_EXTENT_ENTRY_MAX   7
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
-       BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
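
The tagging scheme the comment above describes can be sketched in a few lines of userspace C: the tag for entry type n is 1 << n, and decoding is a find-first-set. entry_type and the ENTRY_* names below are illustrative, not the kernel helpers (which use __ffs()):

#include <stdio.h>

/* Mirror of the enum above, for a userspace sketch: */
enum entry_type { ENTRY_ptr = 0, ENTRY_crc32 = 1, ENTRY_crc64 = 2 };

/* An entry's type field has exactly one bit set; its position is the
 * enum value, so the tag for type n costs n + 1 bits of the entry. */
static enum entry_type entry_type(unsigned long long type_field)
{
	return __builtin_ctzll(type_field);	/* like __ffs() */
}

int main(void)
{
	printf("%d\n", entry_type(1ULL << ENTRY_ptr));	 /* 0 */
	printf("%d\n", entry_type(1ULL << ENTRY_crc32)); /* 1 */
	printf("%d\n", entry_type(1ULL << ENTRY_crc64)); /* 2 */
	return 0;
}
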
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u32                   type:2,
-                               _compressed_size:7,
-                               _uncompressed_size:7,
-                               offset:7,
-                               _unused:1,
-                               csum_type:4,
-                               compression_type:4;
-       __u32                   csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u32                   csum;
-       __u32                   compression_type:4,
-                               csum_type:4,
-                               _unused:1,
-                               offset:7,
-                               _uncompressed_size:7,
-                               _compressed_size:7,
-                               type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX         (1U << 7)
-#define CRC32_NONCE_MAX                0
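
To see why the bias matters: a 7-bit field holds 0..127, but a zero-size extent is meaningless, so storing size - 1 lets the field represent 1..128, exactly CRC32_SIZE_MAX. A sketch of the round trip (struct sized, set_size and get_size are illustrative; in the kernel the +1 happens in bch2_extent_crc_unpack()'s common_fields()):

#include <assert.h>
#include <stdio.h>

/* Stand-in for the 7-bit _compressed_size/_uncompressed_size fields: */
struct sized {
	unsigned _size : 7;
};

static void set_size(struct sized *s, unsigned size)
{
	assert(size >= 1 && size <= (1U << 7));
	s->_size = size - 1;		/* pack biased by 1 */
}

static unsigned get_size(const struct sized *s)
{
	return s->_size + 1;		/* unpack: add the bias back */
}

int main(void)
{
	struct sized s;

	set_size(&s, 128);		/* maximum representable size */
	printf("%u\n", get_size(&s));	/* 128 */
	return 0;
}
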
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:3,
-                               _compressed_size:9,
-                               _uncompressed_size:9,
-                               offset:9,
-                               nonce:10,
-                               csum_type:4,
-                               compression_type:4,
-                               csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   csum_hi:16,
-                               compression_type:4,
-                               csum_type:4,
-                               nonce:10,
-                               offset:9,
-                               _uncompressed_size:9,
-                               _compressed_size:9,
-                               type:3;
-#endif
-       __u64                   csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX         (1U << 9)
-#define CRC64_NONCE_MAX                ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:4,
-                               _compressed_size:13,
-                               _uncompressed_size:13,
-                               offset:13,
-                               nonce:13,
-                               csum_type:4,
-                               compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   compression_type:4,
-                               csum_type:4,
-                               nonce:13,
-                               offset:13,
-                               _uncompressed_size:13,
-                               _compressed_size:13,
-                               type:4;
-#endif
-       struct bch_csum         csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX                (1U << 13)
-#define CRC128_NONCE_MAX       ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:1,
-                               cached:1,
-                               unused:1,
-                               unwritten:1,
-                               offset:44, /* 8 petabytes */
-                               dev:8,
-                               gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   gen:8,
-                               dev:8,
-                               offset:44,
-                               unwritten:1,
-                               unused:1,
-                               cached:1,
-                               type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:5,
-                               block:8,
-                               redundancy:4,
-                               idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   idx:47,
-                               redundancy:4,
-                               block:8,
-                               type:5;
-#endif
-};
-
-#define BCH_EXTENT_FLAGS()             \
-       x(poisoned,             0)
-
-enum bch_extent_flags_e {
-#define x(n, v)        BCH_EXTENT_FLAG_##n = v,
-       BCH_EXTENT_FLAGS()
-#undef x
-};
-
-struct bch_extent_flags {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:7,
-                               flags:57;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   flags:57,
-                               type:7;
-#endif
-};
-
-/* bch_extent_rebalance: */
-#include "rebalance_format.h"
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
-       unsigned long                   type;
-#elif __BITS_PER_LONG == 32
-       struct {
-               unsigned long           pad;
-               unsigned long           type;
-       };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f  f;
-       BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
-       struct bch_val          v;
-
-       __u64                   _data[0];
-       struct bch_extent_ptr   start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
-       struct bch_val          v;
-
-       __u64                   mem_ptr;
-       __le64                  seq;
-       __le16                  sectors_written;
-       __le16                  flags;
-       struct bpos             min_key;
-       __u64                   _data[0];
-       struct bch_extent_ptr   start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
-       struct bch_val          v;
-
-       __u64                   _data[0];
-       union bch_extent_entry  start[];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
-       ((sizeof(struct bch_extent_crc128) +                    \
-         sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX                               \
-       (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX                            \
-       ((sizeof(struct bch_btree_ptr_v2) +                     \
-         sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX                                        \
-       (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-struct bch_reservation {
-       struct bch_val          v;
-
-       __le32                  generation;
-       __u8                    nr_replicas;
-       __u8                    pad[3];
-} __packed __aligned(8);
-
-struct bch_inline_data {
-       struct bch_val          v;
-       u8                      data[];
-};
-
-#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
deleted file mode 100644 (file)
index b23ce4a..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_TYPES_H
-#define _BCACHEFS_EXTENTS_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_extent_crc_unpacked {
-       u32                     compressed_size;
-       u32                     uncompressed_size;
-       u32                     live_size;
-
-       u8                      csum_type;
-       u8                      compression_type;
-
-       u16                     offset;
-
-       u16                     nonce;
-
-       struct bch_csum         csum;
-};
-
-struct extent_ptr_decoded {
-       bool                            has_ec;
-       bool                            do_ec_reconstruct;
-       u8                              crc_retry_nr;
-       struct bch_extent_crc_unpacked  crc;
-       struct bch_extent_ptr           ptr;
-       struct bch_extent_stripe_ptr    ec;
-};
-
-struct bch_io_failures {
-       u8                      nr;
-       struct bch_dev_io_failures {
-               u8              dev;
-               unsigned        failed_csum_nr:6,
-                               failed_io:1,
-                               failed_btree_validate:1,
-                               failed_ec:1;
-       }                       devs[BCH_REPLICAS_MAX + 1];
-};
-
-#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
deleted file mode 100644 (file)
index 0e74255..0000000
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "eytzinger.h"
-
-/**
- * is_aligned - is this pointer & size okay for word-wide copying?
- * @base: pointer to data
- * @size: size of each element
- * @align: required alignment (typically 4 or 8)
- *
- * Returns true if elements can be copied using word loads and stores.
- * The size must be a multiple of the alignment, and the base address must
- * be aligned as well unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set.
- *
- * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
- * to "if ((a | b) & mask)", so we do that by hand.
- */
-__attribute_const__ __always_inline
-static bool is_aligned(const void *base, size_t size, unsigned char align)
-{
-       unsigned char lsbits = (unsigned char)size;
-
-       (void)base;
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-       lsbits |= (unsigned char)(uintptr_t)base;
-#endif
-       return (lsbits & (align - 1)) == 0;
-}
-
-/**
- * swap_words_32 - swap two elements in 32-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 4)
- *
- * Exchange the two objects in memory.  This exploits base+index addressing,
- * which basically all CPUs have, to minimize loop overhead computations.
- *
- * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
- * bottom of the loop, even though the zero flag is still valid from the
- * subtract (since the intervening mov instructions don't alter the flags).
- * Gcc 8.1.0 doesn't have that problem.
- */
-static void swap_words_32(void *a, void *b, size_t n)
-{
-       do {
-               u32 t = *(u32 *)(a + (n -= 4));
-               *(u32 *)(a + n) = *(u32 *)(b + n);
-               *(u32 *)(b + n) = t;
-       } while (n);
-}
-
-/**
- * swap_words_64 - swap two elements in 64-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 8)
- *
- * Exchange the two objects in memory.  This exploits base+index
- * addressing, which basically all CPUs have, to minimize loop overhead
- * computations.
- *
- * We'd like to use 64-bit loads if possible.  If they're not, emulating
- * one requires base+index+4 addressing which x86 has but most other
- * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
- * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
- * x32 ABI).  Are there any cases the kernel needs to worry about?
- */
-static void swap_words_64(void *a, void *b, size_t n)
-{
-       do {
-#ifdef CONFIG_64BIT
-               u64 t = *(u64 *)(a + (n -= 8));
-               *(u64 *)(a + n) = *(u64 *)(b + n);
-               *(u64 *)(b + n) = t;
-#else
-               /* Use two 32-bit transfers to avoid base+index+4 addressing */
-               u32 t = *(u32 *)(a + (n -= 4));
-               *(u32 *)(a + n) = *(u32 *)(b + n);
-               *(u32 *)(b + n) = t;
-
-               t = *(u32 *)(a + (n -= 4));
-               *(u32 *)(a + n) = *(u32 *)(b + n);
-               *(u32 *)(b + n) = t;
-#endif
-       } while (n);
-}
-
-/**
- * swap_bytes - swap two elements a byte at a time
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size
- *
- * This is the fallback if alignment doesn't allow using larger chunks.
- */
-static void swap_bytes(void *a, void *b, size_t n)
-{
-       do {
-               char t = ((char *)a)[--n];
-               ((char *)a)[n] = ((char *)b)[n];
-               ((char *)b)[n] = t;
-       } while (n);
-}
-
-/*
- * The values are arbitrary as long as they can't be confused with
- * a pointer, but small integers make for the smallest compare
- * instructions.
- */
-#define SWAP_WORDS_64 (swap_r_func_t)0
-#define SWAP_WORDS_32 (swap_r_func_t)1
-#define SWAP_BYTES    (swap_r_func_t)2
-#define SWAP_WRAPPER  (swap_r_func_t)3
-
-struct wrapper {
-       cmp_func_t cmp;
-       swap_func_t swap_func;
-};
-
-/*
- * The function pointer is last to make tail calls most efficient if the
- * compiler decides not to inline this function.
- */
-static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
-{
-       if (swap_func == SWAP_WRAPPER) {
-               ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
-               return;
-       }
-
-       if (swap_func == SWAP_WORDS_64)
-               swap_words_64(a, b, size);
-       else if (swap_func == SWAP_WORDS_32)
-               swap_words_32(a, b, size);
-       else if (swap_func == SWAP_BYTES)
-               swap_bytes(a, b, size);
-       else
-               swap_func(a, b, (int)size, priv);
-}
-
-#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-
-static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
-{
-       if (cmp == _CMP_WRAPPER)
-               return ((const struct wrapper *)priv)->cmp(a, b);
-       return cmp(a, b, priv);
-}
-
-static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
-                        cmp_r_func_t cmp_func, const void *priv,
-                        size_t l, size_t r)
-{
-       return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
-                     base1 + inorder_to_eytzinger1(r, n) * size,
-                     cmp_func, priv);
-}
-
-static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
-                          swap_r_func_t swap_func, const void *priv,
-                          size_t l, size_t r)
-{
-       do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
-               base1 + inorder_to_eytzinger1(r, n) * size,
-               size, swap_func, priv);
-}
-
-static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
-                             cmp_r_func_t cmp_func,
-                             swap_r_func_t swap_func,
-                             const void *priv)
-{
-       unsigned i, j, k;
-
-       /* called from 'sort' without swap function, let's pick the default */
-       if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
-               swap_func = NULL;
-
-       if (!swap_func) {
-               if (is_aligned(base1, size, 8))
-                       swap_func = SWAP_WORDS_64;
-               else if (is_aligned(base1, size, 4))
-                       swap_func = SWAP_WORDS_32;
-               else
-                       swap_func = SWAP_BYTES;
-       }
-
-       /* heapify */
-       for (i = n / 2; i >= 1; --i) {
-               /* Find the sift-down path all the way to the leaves. */
-               for (j = i; k = j * 2, k < n;)
-                       j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
-               /* Special case for the last leaf with no sibling. */
-               if (j * 2 == n)
-                       j *= 2;
-
-               /* Backtrack to the correct location. */
-               while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
-                       j /= 2;
-
-               /* Shift the element into its correct place. */
-               for (k = j; j != i;) {
-                       j /= 2;
-                       eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
-               }
-       }
-
-       /* sort */
-       for (i = n; i > 1; --i) {
-               eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
-
-               /* Find the sift-down path all the way to the leaves. */
-               for (j = 1; k = j * 2, k + 1 < i;)
-                       j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
-               /* Special case for the last leaf with no sibling. */
-               if (j * 2 + 1 == i)
-                       j *= 2;
-
-               /* Backtrack to the correct location. */
-               while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
-                       j /= 2;
-
-               /* Shift the element into its correct place. */
-               for (k = j; j > 1;) {
-                       j /= 2;
-                       eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
-               }
-       }
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
-                      cmp_r_func_t cmp_func,
-                      swap_r_func_t swap_func,
-                      const void *priv)
-{
-       void *base1 = base - size;
-
-       return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-                    cmp_func_t cmp_func,
-                    swap_func_t swap_func)
-{
-       struct wrapper w = {
-               .cmp  = cmp_func,
-               .swap_func = swap_func,
-       };
-
-       return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-
-#if 0
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/ktime.h>
-
-static u64 cmp_count;
-
-static int mycmp(const void *a, const void *b)
-{
-       u32 _a = *(u32 *)a;
-       u32 _b = *(u32 *)b;
-
-       cmp_count++;
-       if (_a < _b)
-               return -1;
-       else if (_a > _b)
-               return 1;
-       else
-               return 0;
-}
-
-static int test(void)
-{
-       size_t N, i;
-       ktime_t start, end;
-       s64 delta;
-       u32 *arr;
-
-       for (N = 10000; N <= 100000; N += 10000) {
-               arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
-               cmp_count = 0;
-
-               for (i = 0; i < N; i++)
-                       arr[i] = get_random_u32();
-
-               start = ktime_get();
-               eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
-               end = ktime_get();
-
-               delta = ktime_us_delta(end, start);
-               printk(KERN_INFO "time: %lld\n", delta);
-               printk(KERN_INFO "comparisons: %lld\n", cmp_count);
-
-               u32 prev = 0;
-
-               eytzinger0_for_each(i, N) {
-                       if (prev > arr[i])
-                               goto err;
-                       prev = arr[i];
-               }
-
-               kfree(arr);
-       }
-       return 0;
-
-err:
-       kfree(arr);
-       return -1;
-}
-#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
deleted file mode 100644 (file)
index 643c1f7..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#ifdef EYTZINGER_DEBUG
-#include <linux/bug.h>
-#define EYTZINGER_BUG_ON(cond)         BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array.
- *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one-based indexing and zero-based indexing.
- *
- * Zero-based indexing is more convenient, but one-based indexing has better
- * alignment and thus better performance, because each new level of the tree
- * starts at a power of two: if element 0 was cacheline aligned, each new
- * level will be as well.
- */
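
Concretely, for a sorted array of seven elements the one-based eytzinger order is 4 2 6 1 3 5 7: the median at the root, each level contiguous in memory, children of index i at 2i and 2i + 1. A small userspace sketch that builds this layout by in-order traversal (fill is an illustrative helper, not part of this header):

#include <stdio.h>

/* Fill a 1-based eytzinger array from sorted[]: visiting nodes 1..n
 * in-order yields the sorted order. */
static void fill(int *eytz, unsigned n, const int *sorted, unsigned *pos,
		 unsigned i)
{
	if (i > n)
		return;
	fill(eytz, n, sorted, pos, 2 * i);	/* left child */
	eytz[i] = sorted[(*pos)++];
	fill(eytz, n, sorted, pos, 2 * i + 1);	/* right child */
}

int main(void)
{
	const int sorted[] = { 1, 2, 3, 4, 5, 6, 7 };
	int eytz[8];
	unsigned pos = 0;

	fill(eytz, 7, sorted, &pos, 1);

	/* Prints 4 2 6 1 3 5 7. */
	for (unsigned i = 1; i <= 7; i++)
		printf("%d ", eytz[i]);
	printf("\n");
	return 0;
}
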
-
-static inline unsigned eytzinger1_child(unsigned i, unsigned child)
-{
-       EYTZINGER_BUG_ON(child > 1);
-
-       return (i << 1) + child;
-}
-
-static inline unsigned eytzinger1_left_child(unsigned i)
-{
-       return eytzinger1_child(i, 0);
-}
-
-static inline unsigned eytzinger1_right_child(unsigned i)
-{
-       return eytzinger1_child(i, 1);
-}
-
-static inline unsigned eytzinger1_first(unsigned size)
-{
-       return size ? rounddown_pow_of_two(size) : 0;
-}
-
-static inline unsigned eytzinger1_last(unsigned size)
-{
-       return rounddown_pow_of_two(size + 1) - 1;
-}
-
-static inline unsigned eytzinger1_next(unsigned i, unsigned size)
-{
-       EYTZINGER_BUG_ON(i == 0 || i > size);
-
-       if (eytzinger1_right_child(i) <= size) {
-               i = eytzinger1_right_child(i);
-
-               i <<= __fls(size) - __fls(i);
-               i >>= i > size;
-       } else {
-               i >>= ffz(i) + 1;
-       }
-
-       return i;
-}
-
-static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
-{
-       EYTZINGER_BUG_ON(i == 0 || i > size);
-
-       if (eytzinger1_left_child(i) <= size) {
-               i = eytzinger1_left_child(i) + 1;
-
-               i <<= __fls(size) - __fls(i);
-               i -= 1;
-               i >>= i > size;
-       } else {
-               i >>= __ffs(i) + 1;
-       }
-
-       return i;
-}
-
-static inline unsigned eytzinger1_extra(unsigned size)
-{
-       return size
-               ? (size + 1 - rounddown_pow_of_two(size)) << 1
-               : 0;
-}
-
-static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
-                                             unsigned extra)
-{
-       unsigned b = __fls(i);
-       unsigned shift = __fls(size) - b;
-       int s;
-
-       EYTZINGER_BUG_ON(!i || i > size);
-
-       i  ^= 1U << b;
-       i <<= 1;
-       i  |= 1;
-       i <<= shift;
-
-       /*
-        * sign bit trick:
-        *
-        * if (i > extra)
-        *      i -= (i - extra) >> 1;
-        */
-       s = extra - i;
-       i += (s >> 1) & (s >> 31);
-
-       return i;
-}
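A quick numeric check of the sign bit trick above (editor's note): with extra = 4 and i = 10, s = -6, so s >> 31 is all ones and s >> 1 is -3 under arithmetic shift, giving i += -3, i.e. i = 7 - exactly what i -= (i - extra) >> 1 produces. With i = 3 <= extra, s is positive, the mask (s >> 31) is 0, and i is unchanged.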
-
-static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
-                                              unsigned extra)
-{
-       unsigned shift;
-       int s;
-
-       EYTZINGER_BUG_ON(!i || i > size);
-
-       /*
-        * sign bit trick:
-        *
-        * if (i > extra)
-        *      i += i - extra;
-        */
-       s = extra - i;
-       i -= s & (s >> 31);
-
-       shift = __ffs(i);
-
-       i >>= shift + 1;
-       i  |= 1U << (__fls(size) - shift);
-
-       return i;
-}
-
-static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
-{
-       return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
-{
-       return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
-}
-
-#define eytzinger1_for_each(_i, _size)                 \
-       for (unsigned (_i) = eytzinger1_first((_size)); \
-            (_i) != 0;                                 \
-            (_i) = eytzinger1_next((_i), (_size)))
-
-/* Zero-based indexing version: */
-
-static inline unsigned eytzinger0_child(unsigned i, unsigned child)
-{
-       EYTZINGER_BUG_ON(child > 1);
-
-       return (i << 1) + 1 + child;
-}
-
-static inline unsigned eytzinger0_left_child(unsigned i)
-{
-       return eytzinger0_child(i, 0);
-}
-
-static inline unsigned eytzinger0_right_child(unsigned i)
-{
-       return eytzinger0_child(i, 1);
-}
-
-static inline unsigned eytzinger0_first(unsigned size)
-{
-       return eytzinger1_first(size) - 1;
-}
-
-static inline unsigned eytzinger0_last(unsigned size)
-{
-       return eytzinger1_last(size) - 1;
-}
-
-static inline unsigned eytzinger0_next(unsigned i, unsigned size)
-{
-       return eytzinger1_next(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
-{
-       return eytzinger1_prev(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_extra(unsigned size)
-{
-       return eytzinger1_extra(size);
-}
-
-static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
-                                              unsigned extra)
-{
-       return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
-}
-
-static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
-                                              unsigned extra)
-{
-       return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
-}
-
-static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
-{
-       return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
-{
-       return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
-}
-
-#define eytzinger0_for_each(_i, _size)                 \
-       for (unsigned (_i) = eytzinger0_first((_size)); \
-            (_i) != -1;                                \
-            (_i) = eytzinger0_next((_i), (_size)))
-
-#define eytzinger0_for_each_prev(_i, _size)            \
-       for (unsigned (_i) = eytzinger0_last((_size));  \
-            (_i) != -1;                                \
-            (_i) = eytzinger0_prev((_i), (_size)))
-
-/* return greatest node <= @search, or -1 if not found */
-static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
-                                    cmp_func_t cmp, const void *search)
-{
-       void *base1 = base - size;
-       unsigned n = 1;
-
-       while (n <= nr)
-               n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
-       n >>= __ffs(n) + 1;
-       return n - 1;
-}
-
-/* return smallest node > @search, or -1 if not found */
-static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
-                                    cmp_func_t cmp, const void *search)
-{
-       void *base1 = base - size;
-       unsigned n = 1;
-
-       while (n <= nr)
-               n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
-       n >>= __ffs(n + 1) + 1;
-       return n - 1;
-}
-
-/* return smallest node >= @search, or -1 if not found */
-static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
-                                    cmp_func_t cmp, const void *search)
-{
-       void *base1 = base - size;
-       unsigned n = 1;
-
-       while (n <= nr)
-               n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
-       n >>= __ffs(n + 1) + 1;
-       return n - 1;
-}
-
-#define eytzinger0_find(base, nr, size, _cmp, search)                  \
-({                                                                     \
-       size_t _size            = (size);                               \
-       void *_base1            = (void *)(base) - _size;               \
-       const void *_search     = (search);                             \
-       size_t _nr              = (nr);                                 \
-       size_t _i               = 1;                                    \
-       int _res;                                                       \
-                                                                       \
-       while (_i <= _nr &&                                             \
-              (_res = _cmp(_search, _base1 + _i * _size)))             \
-               _i = eytzinger1_child(_i, _res > 0);                    \
-       _i - 1;                                                         \
-})
-
-void eytzinger0_sort_r(void *, size_t, size_t,
-                      cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
-
-#endif /* _EYTZINGER_H */
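Taken together, typical use of this header looked like the following minimal sketch (editor's illustration, not from the original source; the comparator is any cmp_func_t such as the mycmp() in the test code above):

	static int lookup_le(u32 *arr, size_t nr, u32 key, cmp_func_t cmp)
	{
		/* sort the array in place into (zero-based) eytzinger order */
		eytzinger0_sort(arr, nr, sizeof(arr[0]), cmp, NULL);

		/* greatest element <= key, as an eytzinger0 index, or -1 */
		return eytzinger0_find_le(arr, nr, sizeof(arr[0]), cmp, &key);
	}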
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
deleted file mode 100644 (file)
index 2faec14..0000000
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Fast, unordered lists
- *
- * Supports add, remove, and iterate
- *
- * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
- * allocation and freeing.
- *
- * This means that adding, removing, and iterating over items are all lockless,
- * except when refilling/emptying the percpu slot buffers.
- */
-
-#include "fast_list.h"
-
-struct fast_list_pcpu {
-       u32                     nr;
-       u32                     entries[31];
-};
-
-static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
-{
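-       /* slot 0 is reserved as a sentinel: a return of 0 means failure */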
-       int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
-       if (unlikely(idx < 0))
-               return 0;
-
-       if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
-               ida_free(&l->slots_allocated, idx);
-               return 0;
-       }
-
-       return idx;
-}
-
-/**
- * fast_list_get_idx - get a slot in a fast_list
- * @l:         list to get slot in
- *
- * This allocates a slot in the radix tree without storing to it, so that the
- * potential memory allocation failure is taken early, while the actual list
- * add can be done later, in a context that cannot tolerate allocation failure.
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_get_idx(struct fast_list *l)
-{
-       unsigned long flags;
-       int idx;
-retry:
-       local_irq_save(flags);
-       struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
-       if (unlikely(!lp->nr)) {
-               u32 entries[16], nr = 0;
-
-               local_irq_restore(flags);
-               while (nr < ARRAY_SIZE(entries) &&
-                      (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
-                       entries[nr++] = idx;
-               local_irq_save(flags);
-
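-               /*
-                * IRQs were enabled while allocating above, so we may have
-                * migrated CPUs: reload the percpu buffer pointer.
-                */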
-               lp = this_cpu_ptr(l->buffer);
-
-               while (nr && lp->nr < ARRAY_SIZE(lp->entries))
-                       lp->entries[lp->nr++] = entries[--nr];
-
-               if (unlikely(nr)) {
-                       local_irq_restore(flags);
-                       while (nr)
-                               ida_free(&l->slots_allocated, entries[--nr]);
-                       goto retry;
-               }
-
-               if (unlikely(!lp->nr)) {
-                       local_irq_restore(flags);
-                       return -ENOMEM;
-               }
-       }
-
-       idx = lp->entries[--lp->nr];
-       local_irq_restore(flags);
-
-       return idx;
-}
-
-/**
- * fast_list_add - add an item to a fast_list
- * @l:         list
- * @item:      item to add
- *
- * Allocates a slot in the radix tree and stores to it and then returns the
- * slot index, which must be passed to fast_list_remove().
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_add(struct fast_list *l, void *item)
-{
-       int idx = fast_list_get_idx(l);
-       if (idx < 0)
-               return idx;
-
-       *genradix_ptr_inlined(&l->items, idx) = item;
-       return idx;
-}
-
-/**
- * fast_list_remove - remove an item from a fast_list
- * @l:         list
- * @idx:       item's slot index
- *
- * Zeroes out the slot in the radix tree and frees the slot for future
- * fast_list_add() operations.
- */
-void fast_list_remove(struct fast_list *l, unsigned idx)
-{
-       u32 entries[16], nr = 0;
-       unsigned long flags;
-
-       if (!idx)
-               return;
-
-       *genradix_ptr_inlined(&l->items, idx) = NULL;
-
-       local_irq_save(flags);
-       struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
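-       /* percpu buffer full: set a batch aside to free back to the IDA below */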
-       if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
-               while (nr < ARRAY_SIZE(entries))
-                       entries[nr++] = lp->entries[--lp->nr];
-
-       lp->entries[lp->nr++] = idx;
-       local_irq_restore(flags);
-
-       if (unlikely(nr))
-               while (nr)
-                       ida_free(&l->slots_allocated, entries[--nr]);
-}
-
-void fast_list_exit(struct fast_list *l)
-{
-       /* XXX: warn if list isn't empty */
-       free_percpu(l->buffer);
-       ida_destroy(&l->slots_allocated);
-       genradix_free(&l->items);
-}
-
-int fast_list_init(struct fast_list *l)
-{
-       genradix_init(&l->items);
-       ida_init(&l->slots_allocated);
-       l->buffer = alloc_percpu(*l->buffer);
-       if (!l->buffer)
-               return -ENOMEM;
-       return 0;
-}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
deleted file mode 100644 (file)
index 73c9bf5..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _LINUX_FAST_LIST_H
-#define _LINUX_FAST_LIST_H
-
-#include <linux/generic-radix-tree.h>
-#include <linux/idr.h>
-#include <linux/percpu.h>
-
-struct fast_list_pcpu;
-
-struct fast_list {
-       GENRADIX(void *)        items;
-       struct ida              slots_allocated;
-       struct fast_list_pcpu __percpu
-                               *buffer;
-};
-
-static inline void *fast_list_iter_peek(struct genradix_iter *iter,
-                                       struct fast_list *list)
-{
-       void **p;
-       while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
-               genradix_iter_advance(iter, &list->items);
-
-       return p ? *p : NULL;
-}
-
-#define fast_list_for_each_from(_list, _iter, _i, _start)              \
-       for (_iter = genradix_iter_init(&(_list)->items, _start);       \
-            (_i = fast_list_iter_peek(&(_iter), _list)) != NULL;       \
-            genradix_iter_advance(&(_iter), &(_list)->items))
-
-#define fast_list_for_each(_list, _iter, _i)                           \
-       fast_list_for_each_from(_list, _iter, _i, 0)
-
-int fast_list_get_idx(struct fast_list *l);
-int fast_list_add(struct fast_list *l, void *item);
-void fast_list_remove(struct fast_list *l, unsigned idx);
-void fast_list_exit(struct fast_list *l);
-int fast_list_init(struct fast_list *l);
-
-#endif /* _LINUX_FAST_LIST_H */
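A minimal lifecycle sketch for this API (editor's illustration, not from the original source; item is a placeholder pointer):

	static int fast_list_example(struct fast_list *l, void *item)
	{
		struct genradix_iter iter;
		void *p;

		int idx = fast_list_add(l, item);	/* positive slot index, or -ENOMEM */
		if (idx < 0)
			return idx;

		fast_list_for_each(l, iter, p)		/* lockless iteration */
			pr_info("entry %p\n", p);

		fast_list_remove(l, idx);		/* frees the slot for reuse */
		return 0;
	}

with fast_list_init() and fast_list_exit() bracketing the list's lifetime.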
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
deleted file mode 100644 (file)
index d8153fe..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FIFO_H
-#define _BCACHEFS_FIFO_H
-
-#include "util.h"
-
-#define FIFO(type)                                                     \
-struct {                                                               \
-       size_t front, back, size, mask;                                 \
-       type *data;                                                     \
-}
-
-#define DECLARE_FIFO(type, name)       FIFO(type) name
-
-#define fifo_buf_size(fifo)                                            \
-       ((fifo)->size                                                   \
-        ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])   \
-        : 0)
-
-#define init_fifo(fifo, _size, _gfp)                                   \
-({                                                                     \
-       (fifo)->front   = (fifo)->back = 0;                             \
-       (fifo)->size    = (_size);                                      \
-       (fifo)->mask    = (fifo)->size                                  \
-               ? roundup_pow_of_two((fifo)->size) - 1                  \
-               : 0;                                                    \
-       (fifo)->data    = kvmalloc(fifo_buf_size(fifo), (_gfp));        \
-})
-
-#define free_fifo(fifo)                                                        \
-do {                                                                   \
-       kvfree((fifo)->data);                                           \
-       (fifo)->data = NULL;                                            \
-} while (0)
-
-#define fifo_swap(l, r)                                                        \
-do {                                                                   \
-       swap((l)->front, (r)->front);                                   \
-       swap((l)->back, (r)->back);                                     \
-       swap((l)->size, (r)->size);                                     \
-       swap((l)->mask, (r)->mask);                                     \
-       swap((l)->data, (r)->data);                                     \
-} while (0)
-
-#define fifo_move(dest, src)                                           \
-do {                                                                   \
-       typeof(*((dest)->data)) _t;                                     \
-       while (!fifo_full(dest) &&                                      \
-              fifo_pop(src, _t))                                       \
-               fifo_push(dest, _t);                                    \
-} while (0)
-
-#define fifo_used(fifo)                (((fifo)->back - (fifo)->front))
-#define fifo_free(fifo)                ((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo)       ((fifo)->front == (fifo)->back)
-#define fifo_full(fifo)                (fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo)  ((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo)   ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx_abs(fifo, p)                                    \
-       ((((p) >= &fifo_peek_front(fifo)                                \
-          ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +           \
-          (((p) - (fifo)->data)))
-
-#define fifo_entry_idx(fifo, p)        (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i)        ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
-
-#define fifo_push_back_ref(f)                                          \
-       (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
-
-#define fifo_push_front_ref(f)                                         \
-       (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
-
-#define fifo_push_back(fifo, new)                                      \
-({                                                                     \
-       typeof((fifo)->data) _r = fifo_push_back_ref(fifo);             \
-       if (_r)                                                         \
-               *_r = (new);                                            \
-       _r != NULL;                                                     \
-})
-
-#define fifo_push_front(fifo, new)                                     \
-({                                                                     \
-       typeof((fifo)->data) _r = fifo_push_front_ref(fifo);            \
-       if (_r)                                                         \
-               *_r = (new);                                            \
-       _r != NULL;                                                     \
-})
-
-#define fifo_pop_front(fifo, i)                                                \
-({                                                                     \
-       bool _r = !fifo_empty((fifo));                                  \
-       if (_r)                                                         \
-               (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];     \
-       _r;                                                             \
-})
-
-#define fifo_pop_back(fifo, i)                                         \
-({                                                                     \
-       bool _r = !fifo_empty((fifo));                                  \
-       if (_r)                                                         \
-               (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
-       _r;                                                             \
-})
-
-#define fifo_push_ref(fifo)    fifo_push_back_ref(fifo)
-#define fifo_push(fifo, i)     fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i)      fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo)                fifo_peek_front(fifo)
-
-#define fifo_for_each_entry(_entry, _fifo, _iter)                      \
-       for (typecheck(typeof((_fifo)->front), _iter),                  \
-            (_iter) = (_fifo)->front;                                  \
-            ((_iter != (_fifo)->back) &&                               \
-             (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
-            (_iter)++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)                    \
-       for (typecheck(typeof((_fifo)->front), _iter),                  \
-            (_iter) = (_fifo)->front;                                  \
-            ((_iter != (_fifo)->back) &&                               \
-             (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));  \
-            (_iter)++)
-
-#endif /* _BCACHEFS_FIFO_H */
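A minimal usage sketch of these macros (editor's illustration, not from the original source):

	static int fifo_example(void)
	{
		DECLARE_FIFO(u64, fifo);
		u64 v;

		if (!init_fifo(&fifo, 8, GFP_KERNEL))	/* allocation can fail */
			return -ENOMEM;

		fifo_push(&fifo, 1);			/* returns false once full */
		fifo_push(&fifo, 2);

		while (fifo_pop(&fifo, v))		/* pops 1, then 2: FIFO order */
			pr_info("%llu\n", v);

		free_fifo(&fifo);
		return 0;
	}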
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
deleted file mode 100644 (file)
index 1c54b9b..0000000
+++ /dev/null
@@ -1,1109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-       if (bio->bi_vcnt >= bio->bi_max_vecs)
-               return true;
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
-               return true;
-       return false;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
-       struct folio_iter fi;
-
-       bio_for_each_folio_all(fi, bio)
-               folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
-
-       bio_put(bio);
-}
-
-struct readpages_iter {
-       struct address_space    *mapping;
-       unsigned                idx;
-       folios                  folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
-                              struct readahead_control *ractl)
-{
-       struct folio *folio;
-
-       *iter = (struct readpages_iter) { ractl->mapping };
-
-       while ((folio = __readahead_folio(ractl))) {
-               if (!bch2_folio_create(folio, GFP_KERNEL) ||
-                   darray_push(&iter->folios, folio)) {
-                       bch2_folio_release(folio);
-                       ractl->_nr_pages += folio_nr_pages(folio);
-                       ractl->_index -= folio_nr_pages(folio);
-                       return iter->folios.nr ? 0 : -ENOMEM;
-               }
-
-               folio_put(folio);
-       }
-
-       return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
-       if (iter->idx >= iter->folios.nr)
-               return NULL;
-       return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
-       iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
-
-       bkey_for_each_crc(k.k, ptrs, crc, i)
-               if (crc.csum_type || crc.compression_type)
-                       return true;
-       return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
-                              struct readpages_iter *iter,
-                              struct bio *bio,
-                              unsigned sectors_this_extent,
-                              bool get_more)
-{
-       /* Don't hold btree locks while allocating memory: */
-       bch2_trans_unlock(trans);
-
-       while (bio_sectors(bio) < sectors_this_extent &&
-              bio->bi_vcnt < bio->bi_max_vecs) {
-               struct folio *folio = readpage_iter_peek(iter);
-               int ret;
-
-               if (folio) {
-                       readpage_iter_advance(iter);
-               } else {
-                       pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
-                       if (!get_more)
-                               break;
-
-                       unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
-
-                       if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
-                               break;
-
-                       unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
-
-                       /* ensure proper alignment */
-                       order = min(order, __ffs(folio_offset|BIT(31)));
-
-                       folio = xa_load(&iter->mapping->i_pages, folio_offset);
-                       if (folio && !xa_is_value(folio))
-                               break;
-
-                       folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
-                       if (!folio)
-                               break;
-
-                       if (!__bch2_folio_create(folio, GFP_KERNEL)) {
-                               folio_put(folio);
-                               break;
-                       }
-
-                       ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
-                       if (ret) {
-                               __bch2_folio_release(folio);
-                               folio_put(folio);
-                               break;
-                       }
-
-                       folio_put(folio);
-               }
-
-               BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
-               BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
-       }
-
-       return bch2_trans_relock(trans);
-}
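The order clamp in the allocation path above deserves a concrete example (editor's note): order = min(order, __ffs(folio_offset | BIT(31))) keeps a newly allocated folio naturally aligned to its size. For folio_offset = 24 (binary 11000), __ffs() returns 3, capping the folio at order 3 so it starts on an order-3 boundary; OR-ing in BIT(31) merely bounds the result, and keeps __ffs() well defined, when folio_offset is 0.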
-
-static void bchfs_read(struct btree_trans *trans,
-                      struct bch_read_bio *rbio,
-                      subvol_inum inum,
-                      struct readpages_iter *readpages_iter)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       int flags = BCH_READ_retry_if_stale|
-               BCH_READ_may_promote;
-       int ret = 0;
-
-       rbio->subvol = inum.subvol;
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_begin(trans);
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, rbio->bio.bi_iter.bi_sector),
-                            BTREE_ITER_slots);
-       while (1) {
-               struct bkey_s_c k;
-               unsigned bytes, sectors;
-               s64 offset_into_extent;
-               enum btree_id data_btree = BTREE_ID_extents;
-
-               bch2_trans_begin(trans);
-
-               u32 snapshot;
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       goto err;
-
-               bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
-               bch2_btree_iter_set_pos(trans, &iter,
-                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       goto err;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
-               if (readpages_iter) {
-                       ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
-                                                 extent_partial_reads_expensive(k));
-                       if (ret)
-                               goto err;
-               }
-
-               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-
-               if (rbio->bio.bi_iter.bi_size == bytes)
-                       flags |= BCH_READ_last_fragment;
-
-               bch2_bio_page_state_set(&rbio->bio, k);
-
-               bch2_read_extent(trans, rbio, iter.pos,
-                                data_btree, k, offset_into_extent, flags);
-               /*
-                * Careful: there's a landmine here if bch2_read_extent() ever
-                * starts returning transaction restarts here.
-                *
-                * We've changed rbio->bi_iter.bi_size to be "bytes we can read
-                * from this extent" with the swap call, and we restore it
-                * below. That restore needs to come before checking for
-                * errors.
-                *
-                * But unlike __bch2_read(), we use the rbio bvec iter, not one
-                * on the stack, so we can't do the restore right after the
-                * bch2_read_extent() call: we don't own that iterator anymore
-                * if BCH_READ_last_fragment is set, since we may have submitted
-                * that rbio instead of cloning it.
-                */
-
-               if (flags & BCH_READ_last_fragment)
-                       break;
-
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-               bio_advance(&rbio->bio, bytes);
-err:
-               if (ret &&
-                   !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret) {
-               struct printbuf buf = PRINTBUF;
-               lockrestart_do(trans,
-                       bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
-               prt_printf(&buf, "read error %i from btree lookup", ret);
-               bch_err_ratelimited(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-
-               rbio->bio.bi_status = BLK_STS_IOERR;
-               bio_endio(&rbio->bio);
-       }
-
-       bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
-       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct folio *folio;
-       struct readpages_iter readpages_iter;
-       struct blk_plug plug;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       int ret = readpages_iter_init(&readpages_iter, ractl);
-       if (ret)
-               return;
-
-       /*
-        * Besides being a general performance optimization, plugging helps us
-        * avoid btree transaction srcu warnings - submitting a bio can block,
-        * and we don't want to do that with the transaction locked.
-        *
-        * However, plugged bios are submitted when we schedule; we ideally
-        * would have our own scheduler hook to call unlock_long() before
-        * scheduling.
-        */
-       blk_start_plug(&plug);
-       bch2_pagecache_add_get(inode);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-       while ((folio = readpage_iter_peek(&readpages_iter))) {
-               unsigned n = min_t(unsigned,
-                                  readpages_iter.folios.nr -
-                                  readpages_iter.idx,
-                                  BIO_MAX_VECS);
-               struct bch_read_bio *rbio =
-                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
-                                                  GFP_KERNEL, &c->bio_read),
-                                 c,
-                                 opts,
-                                 bch2_readpages_end_io);
-
-               readpage_iter_advance(&readpages_iter);
-
-               rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-               BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-               bchfs_read(trans, rbio, inode_inum(inode),
-                          &readpages_iter);
-               bch2_trans_unlock(trans);
-       }
-       bch2_trans_put(trans);
-
-       bch2_pagecache_add_put(inode);
-       blk_finish_plug(&plug);
-       darray_exit(&readpages_iter.folios);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
-       complete(bio->bi_private);
-}
-
-int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_read_bio *rbio;
-       struct bch_io_opts opts;
-       struct blk_plug plug;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       BUG_ON(folio_test_uptodate(folio));
-       BUG_ON(folio_test_dirty(folio));
-
-       if (!bch2_folio_create(folio, GFP_KERNEL))
-               return -ENOMEM;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-                        c,
-                        opts,
-                        bch2_read_single_folio_end_io);
-       rbio->bio.bi_private = &done;
-       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-       rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-       BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-       blk_start_plug(&plug);
-       bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
-       blk_finish_plug(&plug);
-       wait_for_completion(&done);
-
-       ret = blk_status_to_errno(rbio->bio.bi_status);
-       bio_put(&rbio->bio);
-
-       if (ret < 0)
-               return ret;
-
-       folio_mark_uptodate(folio);
-       return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
-       int ret;
-
-       ret = bch2_read_single_folio(folio, folio->mapping);
-       folio_unlock(folio);
-       return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_io {
-       struct bch_inode_info           *inode;
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
-struct bch_writepage_state {
-       struct bch_writepage_io *io;
-       struct bch_io_opts      opts;
-       struct bch_folio_sector *tmp;
-       unsigned                tmp_sectors;
-       struct blk_plug         plug;
-};
-
-/*
- * Determine when a writepage io is full. We have to limit writepage bios to a
- * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
- * what the bounce path in bch2_write_extent() can handle. In theory we could
- * loosen this restriction for non-bounce I/O, but we don't have that context
- * here. Ideally, we could raise this limit and make it configurable in the
- * future, once the bounce path can be enhanced to accommodate larger source bios.
- */
-static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
-{
-       struct bio *bio = &io->op.wbio.bio;
-       return bio_full(bio, len) ||
-               (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
-}
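For concreteness (editor's note): with 4 KiB pages and BIO_MAX_VECS = 256, the size cap in bch_io_full() works out to 256 * 4096 = 1 MiB, matching the single-page-per-bvec limit described above.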
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
-       struct bch_writepage_io *io =
-               container_of(op, struct bch_writepage_io, op);
-       struct bch_fs *c = io->op.c;
-       struct bio *bio = &io->op.wbio.bio;
-       struct folio_iter fi;
-       unsigned i;
-
-       if (io->op.error) {
-               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       mapping_set_error(fi.folio->mapping, -EIO);
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       if (io->op.flags & BCH_WRITE_wrote_data_inline) {
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       /*
-        * racing with fallocate can cause us to add fewer sectors than
-        * expected - but we shouldn't add more sectors than expected:
-        */
-       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
-       /*
-        * (error (due to going RO) halfway through a page can screw that up
-        * slightly)
-        * XXX wtf?
-          BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
-        */
-
-       /*
-        * The writeback flag is effectively our ref on the inode -
-        * fixup i_blocks before calling folio_end_writeback:
-        */
-       bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
-       bio_for_each_folio_all(fi, bio) {
-               struct bch_folio *s = __bch2_folio(fi.folio);
-
-               if (atomic_dec_and_test(&s->write_count))
-                       folio_end_writeback(fi.folio);
-       }
-
-       bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
-       struct bch_writepage_io *io = w->io;
-
-       w->io = NULL;
-       closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
-                                   struct writeback_control *wbc,
-                                   struct bch_writepage_state *w,
-                                   struct bch_inode_info *inode,
-                                   u64 sector,
-                                   unsigned nr_replicas)
-{
-       struct bch_write_op *op;
-
-       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
-                                             REQ_OP_WRITE,
-                                             GFP_KERNEL,
-                                             &c->writepage_bioset),
-                            struct bch_writepage_io, op.wbio.bio);
-
-       w->io->inode            = inode;
-       op                      = &w->io->op;
-       bch2_write_op_init(op, c, w->opts);
-       op->target              = w->opts.foreground_target;
-       op->nr_replicas         = nr_replicas;
-       op->res.nr_replicas     = nr_replicas;
-       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
-       op->subvol              = inode->ei_inum.subvol;
-       op->pos                 = POS(inode->v.i_ino, sector);
-       op->end_io              = bch2_writepage_io_done;
-       op->devs_need_flush     = &inode->ei_devs_need_flush;
-       op->wbio.bio.bi_iter.bi_sector = sector;
-       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
-                           struct writeback_control *wbc,
-                           void *data)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_writepage_state *w = data;
-       struct bch_folio *s;
-       unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
-       loff_t i_size = i_size_read(&inode->v);
-       int ret;
-
-       EBUG_ON(!folio_test_uptodate(folio));
-
-       /* Is the folio fully inside i_size? */
-       if (folio_end_pos(folio) <= i_size)
-               goto do_io;
-
-       /* Is the folio fully outside i_size? (truncate in progress) */
-       if (folio_pos(folio) >= i_size) {
-               folio_unlock(folio);
-               return 0;
-       }
-
-       /*
-        * The folio straddles i_size.  It must be zeroed out on each and every
-        * writepage invocation because it may be mmapped.  "A file is mapped
-        * in multiples of the folio size.  For a file that is not a multiple of
-        * the  folio size, the remaining memory is zeroed when mapped, and
-        * writes to that region are not written out to the file."
-        */
-       folio_zero_segment(folio,
-                          i_size - folio_pos(folio),
-                          folio_size(folio));
-do_io:
-       f_sectors = folio_sectors(folio);
-       s = bch2_folio(folio);
-
-       if (f_sectors > w->tmp_sectors) {
-               kfree(w->tmp);
-               w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
-               w->tmp_sectors = f_sectors;
-       }
-
-       /*
-        * Things get really hairy with errors during writeback:
-        */
-       ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
-       BUG_ON(ret);
-
-       /* Before unlocking the page, get copy of reservations: */
-       spin_lock(&s->lock);
-       memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               nr_replicas_this_write =
-                       min_t(unsigned, nr_replicas_this_write,
-                             s->s[i].nr_replicas +
-                             s->s[i].replicas_reserved);
-       }
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               s->s[i].nr_replicas = w->opts.compression
-                       ? 0 : nr_replicas_this_write;
-
-               s->s[i].replicas_reserved = 0;
-               bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
-       }
-       spin_unlock(&s->lock);
-
-       BUG_ON(atomic_read(&s->write_count));
-       atomic_set(&s->write_count, 1);
-
-       BUG_ON(folio_test_writeback(folio));
-       folio_start_writeback(folio);
-
-       folio_unlock(folio);
-
-       offset = 0;
-       while (1) {
-               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
-               u64 sector;
-
-               while (offset < f_sectors &&
-                      w->tmp[offset].state < SECTOR_dirty)
-                       offset++;
-
-               if (offset == f_sectors)
-                       break;
-
-               while (offset + sectors < f_sectors &&
-                      w->tmp[offset + sectors].state >= SECTOR_dirty) {
-                       reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
-                       dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
-                       sectors++;
-               }
-               BUG_ON(!sectors);
-
-               sector = folio_sector(folio) + offset;
-
-               if (w->io &&
-                   (w->io->op.res.nr_replicas != nr_replicas_this_write ||
-                    bch_io_full(w->io, sectors << 9) ||
-                    bio_end_sector(&w->io->op.wbio.bio) != sector))
-                       bch2_writepage_do_io(w);
-
-               if (!w->io)
-                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
-                                               nr_replicas_this_write);
-
-               atomic_inc(&s->write_count);
-
-               BUG_ON(inode != w->io->inode);
-               BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
-                                    sectors << 9, offset << 9));
-
-               w->io->op.res.sectors += reserved_sectors;
-               w->io->op.i_sectors_delta -= dirty_sectors;
-               w->io->op.new_i_size = i_size;
-
-               offset += sectors;
-       }
-
-       if (atomic_dec_and_test(&s->write_count))
-               folio_end_writeback(folio);
-
-       return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
-
-       bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
-
-       blk_start_plug(&w->plug);
-       int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
-       if (w->io)
-               bch2_writepage_do_io(w);
-       blk_finish_plug(&w->plug);
-       kfree(w->tmp);
-       kfree(w);
-       return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
-                    loff_t pos, unsigned len,
-                    struct folio **foliop, void **fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res;
-       struct folio *folio;
-       unsigned offset;
-       int ret = -ENOMEM;
-
-       res = kmalloc(sizeof(*res), GFP_KERNEL);
-       if (!res)
-               return -ENOMEM;
-
-       bch2_folio_reservation_init(c, inode, res);
-       *fsdata = res;
-
-       bch2_pagecache_add_get(inode);
-
-       folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
-                                   FGP_WRITEBEGIN | fgf_set_order(len),
-                                   mapping_gfp_mask(mapping));
-       if (IS_ERR(folio))
-               goto err_unlock;
-
-       offset = pos - folio_pos(folio);
-       len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
-       if (folio_test_uptodate(folio))
-               goto out;
-
-       /* If we're writing entire folio, don't need to read it in first: */
-       if (!offset && len == folio_size(folio))
-               goto out;
-
-       if (!offset && pos + len >= inode->v.i_size) {
-               folio_zero_segment(folio, len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-
-       if (folio_pos(folio) >= inode->v.i_size) {
-               folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-readpage:
-       ret = bch2_read_single_folio(folio, mapping);
-       if (ret)
-               goto err;
-out:
-       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-       if (ret)
-               goto err;
-
-       ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
-       if (ret) {
-               if (!folio_test_uptodate(folio)) {
-                       /*
-                        * If the folio hasn't been read in, we won't know if we
-                        * actually need a reservation - we don't actually need
-                        * to read here, we just need to check if the folio is
-                        * fully backed by uncompressed data:
-                        */
-                       goto readpage;
-               }
-
-               goto err;
-       }
-
-       *foliop = folio;
-       return 0;
-err:
-       folio_unlock(folio);
-       folio_put(folio);
-err_unlock:
-       bch2_pagecache_add_put(inode);
-       kfree(res);
-       *fsdata = NULL;
-       return bch2_err_class(ret);
-}
-
-int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
-                  loff_t pos, unsigned len, unsigned copied,
-                  struct folio *folio, void *fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res = fsdata;
-       unsigned offset = pos - folio_pos(folio);
-
-       lockdep_assert_held(&inode->v.i_rwsem);
-       BUG_ON(offset + copied > folio_size(folio));
-
-       if (unlikely(copied < len && !folio_test_uptodate(folio))) {
-               /*
-                * The folio needs to be read in, but that would destroy
-                * our partial write - simplest thing is to just force
-                * userspace to redo the write:
-                */
-               folio_zero_range(folio, 0, folio_size(folio));
-               flush_dcache_folio(folio);
-               copied = 0;
-       }
-
-       spin_lock(&inode->v.i_lock);
-       if (pos + copied > inode->v.i_size)
-               i_size_write(&inode->v, pos + copied);
-       spin_unlock(&inode->v.i_lock);
-
-       if (copied) {
-               if (!folio_test_uptodate(folio))
-                       folio_mark_uptodate(folio);
-
-               bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
-               inode->ei_last_dirtied = (unsigned long) current;
-       }
-
-       folio_unlock(folio);
-       folio_put(folio);
-       bch2_pagecache_add_put(inode);
-
-       bch2_folio_reservation_put(c, inode, res);
-       kfree(res);
-
-       return copied;
-}
-
-static noinline void folios_trunc(folios *fs, struct folio **fi)
-{
-       while (fs->data + fs->nr > fi) {
-               struct folio *f = darray_pop(fs);
-
-               folio_unlock(f);
-               folio_put(f);
-       }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
-                                struct address_space *mapping,
-                                struct iov_iter *iter,
-                                loff_t pos, unsigned len)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       folios fs;
-       struct folio *f;
-       unsigned copied = 0, f_offset, f_copied;
-       u64 end = pos + len, f_pos, f_len;
-       loff_t last_folio_pos = inode->v.i_size;
-       int ret = 0;
-
-       BUG_ON(!len);
-
-       bch2_folio_reservation_init(c, inode, &res);
-       darray_init(&fs);
-
-       ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
-                                              FGP_WRITEBEGIN | fgf_set_order(len),
-                                              mapping_gfp_mask(mapping), &fs);
-       if (ret)
-               goto out;
-
-       BUG_ON(!fs.nr);
-
-       f = darray_first(fs);
-       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
-               ret = bch2_read_single_folio(f, mapping);
-               if (ret)
-                       goto out;
-       }
-
-       f = darray_last(fs);
-       end = min(end, folio_end_pos(f));
-       last_folio_pos = folio_pos(f);
-       if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
-               if (end >= inode->v.i_size) {
-                       folio_zero_range(f, 0, folio_size(f));
-               } else {
-                       ret = bch2_read_single_folio(f, mapping);
-                       if (ret)
-                               goto out;
-               }
-       }
-
-       ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
-       if (ret)
-               goto out;
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(fs));
-       darray_for_each(fs, fi) {
-               ssize_t f_reserved;
-
-               f = *fi;
-               f_len = min(end, folio_end_pos(f)) - f_pos;
-               f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
-
-               if (unlikely(f_reserved != f_len)) {
-                       if (f_reserved < 0) {
-                               if (f == darray_first(fs)) {
-                                       ret = f_reserved;
-                                       goto out;
-                               }
-
-                               folios_trunc(&fs, fi);
-                               end = min(end, folio_end_pos(darray_last(fs)));
-                       } else {
-                               if (!folio_test_uptodate(f)) {
-                                       ret = bch2_read_single_folio(f, mapping);
-                                       if (ret)
-                                               goto out;
-                               }
-
-                               folios_trunc(&fs, fi + 1);
-                               end = f_pos + f_reserved;
-                       }
-
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (mapping_writably_mapped(mapping))
-               darray_for_each(fs, fi)
-                       flush_dcache_folio(*fi);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(fs));
-       darray_for_each(fs, fi) {
-               f = *fi;
-               f_len = min(end, folio_end_pos(f)) - f_pos;
-               f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
-               if (!f_copied) {
-                       folios_trunc(&fs, fi);
-                       break;
-               }
-
-               if (!folio_test_uptodate(f) &&
-                   f_copied != folio_size(f) &&
-                   pos + copied + f_copied < inode->v.i_size) {
-                       iov_iter_revert(iter, f_copied);
-                       folio_zero_range(f, 0, folio_size(f));
-                       folios_trunc(&fs, fi);
-                       break;
-               }
-
-               flush_dcache_folio(f);
-               copied += f_copied;
-
-               if (f_copied != f_len) {
-                       folios_trunc(&fs, fi + 1);
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (!copied)
-               goto out;
-
-       end = pos + copied;
-
-       spin_lock(&inode->v.i_lock);
-       if (end > inode->v.i_size)
-               i_size_write(&inode->v, end);
-       spin_unlock(&inode->v.i_lock);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(fs));
-       darray_for_each(fs, fi) {
-               f = *fi;
-               f_len = min(end, folio_end_pos(f)) - f_pos;
-
-               if (!folio_test_uptodate(f))
-                       folio_mark_uptodate(f);
-
-               bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       inode->ei_last_dirtied = (unsigned long) current;
-out:
-       darray_for_each(fs, fi) {
-               folio_unlock(*fi);
-               folio_put(*fi);
-       }
-
-       /*
-        * If the last folio added to the mapping starts beyond current EOF, we
-        * performed a short write but left around at least one post-EOF folio.
-        * Clean up the mapping before we return.
-        */
-       if (last_folio_pos >= inode->v.i_size)
-               truncate_pagecache(&inode->v, inode->v.i_size);
-
-       darray_exit(&fs);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       loff_t pos = iocb->ki_pos;
-       ssize_t written = 0;
-       int ret = 0;
-
-       bch2_pagecache_add_get(inode);
-
-       do {
-               unsigned offset = pos & (PAGE_SIZE - 1);
-               unsigned bytes = iov_iter_count(iter);
-again:
-               /*
-                * Bring in the user page that we will copy from _first_.
-                * Otherwise there's a nasty deadlock on copying from the
-                * same page as we're writing to, without it being marked
-                * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid, when atomic
-                * usercopies are used, below.
-                */
-               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                       bytes = min_t(unsigned long, iov_iter_count(iter),
-                                     PAGE_SIZE - offset);
-
-                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-               }
-
-               if (unlikely(fatal_signal_pending(current))) {
-                       ret = -EINTR;
-                       break;
-               }
-
-               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
-               if (unlikely(ret < 0))
-                       break;
-
-               cond_resched();
-
-               if (unlikely(ret == 0)) {
-                       /*
-                        * If we were unable to copy any data at all, we must
-                        * fall back to a single segment length write.
-                        *
-                        * If we didn't fall back here, we could livelock
-                        * because not all segments in the iov can be copied at
-                        * once without a pagefault.
-                        */
-                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
-                                     iov_iter_single_seg_count(iter));
-                       goto again;
-               }
-               pos += ret;
-               written += ret;
-               ret = 0;
-
-               balance_dirty_pages_ratelimited(mapping);
-       } while (iov_iter_count(iter));
-
-       bch2_pagecache_add_put(inode);
-
-       return written ? written : ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       ssize_t ret;
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               ret = bch2_direct_write(iocb, from);
-               goto out;
-       }
-
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0)
-               goto unlock;
-
-       ret = file_remove_privs(file);
-       if (ret)
-               goto unlock;
-
-       ret = file_update_time(file);
-       if (ret)
-               goto unlock;
-
-       ret = bch2_buffered_write(iocb, from);
-       if (likely(ret > 0))
-               iocb->ki_pos += ret;
-unlock:
-       inode_unlock(&inode->v);
-
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
-out:
-       return bch2_err_class(ret);
-}
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
-{
-       bioset_exit(&c->writepage_bioset);
-}
-
-int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->writepage_bioset,
-                       4, offsetof(struct bch_writepage_io, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
-       return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
deleted file mode 100644 (file)
index 14de91c..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_BUFFERED_H
-#define _BCACHEFS_FS_IO_BUFFERED_H
-
-#ifndef NO_BCACHEFS_FS
-
-int bch2_read_single_folio(struct folio *, struct address_space *);
-int bch2_read_folio(struct file *, struct folio *);
-
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
-
-int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
-                    unsigned len, struct folio **, void **);
-int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
-                  unsigned len, unsigned copied, struct folio *, void *);
-
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
-int bch2_fs_fs_io_buffered_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
deleted file mode 100644 (file)
index 1f5154d..0000000
+++ /dev/null
@@ -1,704 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "enumerated_ref.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/prefetch.h>
-#include <linux/task_io_accounting_ops.h>
-
-/* O_DIRECT reads */
-
-struct dio_read {
-       struct closure                  cl;
-       struct kiocb                    *req;
-       long                            ret;
-       bool                            should_dirty;
-       struct bch_read_bio             rbio;
-};
-
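-/*
- * If we dirtied the pages (userspace buffers), bio_check_pages_dirty() will
- * redirty them if needed and free the bio once it's safe to do so; otherwise
- * just release the pages and free the bio:
- */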
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
-       if (check_dirty) {
-               bio_check_pages_dirty(bio);
-       } else {
-               bio_release_pages(bio, false);
-               bio_put(bio);
-       }
-}
-
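-/*
- * Closure callback, run once every outstanding bio has completed: report the
- * result to the kiocb, then dirty/release the pages:
- */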
-static CLOSURE_CALLBACK(bch2_dio_read_complete)
-{
-       closure_type(dio, struct dio_read, cl);
-
-       dio->req->ki_complete(dio->req, dio->ret);
-       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-
-       if (bio->bi_status)
-               dio->ret = blk_status_to_errno(bio->bi_status);
-
-       closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
-
-       bch2_direct_IO_read_endio(bio);
-       bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
-       struct file *file = req->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct dio_read *dio;
-       struct bio *bio;
-       struct blk_plug plug;
-       loff_t offset = req->ki_pos;
-       bool sync = is_sync_kiocb(req);
-       bool split = false;
-       size_t shorten;
-       ssize_t ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       /* bios must be 512 byte aligned: */
-       if ((offset|iter->count) & (SECTOR_SIZE - 1))
-               return -EINVAL;
-
-       ret = min_t(loff_t, iter->count,
-                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
-       if (!ret)
-               return ret;
-
-       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
-       if (shorten >= iter->count)
-               shorten = 0;
-       iter->count -= shorten;
-
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_READ,
-                              GFP_KERNEL,
-                              &c->dio_read_bioset);
-
-       dio = container_of(bio, struct dio_read, rbio.bio);
-       closure_init(&dio->cl, NULL);
-
-       /*
-        * this is a _really_ horrible hack just to avoid an atomic sub at the
-        * end:
-        */
-       if (!sync) {
-               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER -
-                          CLOSURE_RUNNING +
-                          CLOSURE_DESTRUCTOR);
-       } else {
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER + 1);
-               dio->cl.closure_get_happened = true;
-       }
-
-       dio->req        = req;
-       dio->ret        = ret;
-       /*
-        * This is one of the sketchier things I've encountered: we have to skip
-        * the dirtying of requests that are internal to the kernel (i.e. from
-        * loopback), because we'll deadlock on page_lock.
-        */
-       dio->should_dirty = iter_is_iovec(iter);
-
-       blk_start_plug(&plug);
-
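-       /*
-        * The first bio is the one embedded in struct dio_read, allocated
-        * above - jump past the allocation on the first iteration; any
-        * further bios needed to cover the request come from c->bio_read:
-        */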
-       goto start;
-       while (iter->count) {
-               split = true;
-
-               bio = bio_alloc_bioset(NULL,
-                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                                      REQ_OP_READ,
-                                      GFP_KERNEL,
-                                      &c->bio_read);
-start:
-               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
-               bio->bi_iter.bi_sector  = offset >> 9;
-               bio->bi_private         = dio;
-
-               ret = bio_iov_iter_get_pages(bio, iter);
-               if (ret < 0) {
-                       /* XXX: fault inject this path */
-                       bio->bi_status = BLK_STS_RESOURCE;
-                       bio_endio(bio);
-                       break;
-               }
-
-               offset += bio->bi_iter.bi_size;
-
-               if (dio->should_dirty)
-                       bio_set_pages_dirty(bio);
-
-               if (iter->count)
-                       closure_get(&dio->cl);
-
-               struct bch_read_bio *rbio =
-                       rbio_init(bio,
-                                 c,
-                                 opts,
-                                 split
-                                 ? bch2_direct_IO_read_split_endio
-                                 : bch2_direct_IO_read_endio);
-
-               bch2_read(c, rbio, inode_inum(inode));
-       }
-
-       blk_finish_plug(&plug);
-
-       iter->count += shorten;
-
-       if (sync) {
-               closure_sync(&dio->cl);
-               closure_debug_destroy(&dio->cl);
-               ret = dio->ret;
-               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-               return ret;
-       } else {
-               return -EIOCBQUEUED;
-       }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret = 0;
-
-       if (!count)
-               return 0; /* skip atime */
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               struct blk_plug plug;
-
-               if (unlikely(mapping->nrpages)) {
-                       ret = filemap_write_and_wait_range(mapping,
-                                               iocb->ki_pos,
-                                               iocb->ki_pos + count - 1);
-                       if (ret < 0)
-                               goto out;
-               }
-
-               file_accessed(file);
-
-               blk_start_plug(&plug);
-               ret = bch2_direct_IO_read(iocb, iter);
-               blk_finish_plug(&plug);
-
-               if (ret >= 0)
-                       iocb->ki_pos += ret;
-       } else {
-               bch2_pagecache_add_get(inode);
-               ret = filemap_read(iocb, iter, ret);
-               bch2_pagecache_add_put(inode);
-       }
-out:
-       return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
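-/*
- * State for an O_DIRECT write, embedded in the write bio: @loop is set once
- * we've made at least one pass through the write loop, @extending means the
- * write extends i_size (and the inode lock stays held throughout), @sync
- * means we complete synchronously, and @flush means an O_DSYNC write needs a
- * journal flush before completion:
- */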
-struct dio_write {
-       struct kiocb                    *req;
-       struct address_space            *mapping;
-       struct bch_inode_info           *inode;
-       struct mm_struct                *mm;
-       const struct iovec              *iov;
-       unsigned                        loop:1,
-                                       extending:1,
-                                       sync:1,
-                                       flush:1;
-       struct quota_res                quota_res;
-       u64                             written;
-
-       struct iov_iter                 iter;
-       struct iovec                    inline_vecs[2];
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
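-/*
- * Check whether the given extent range is already fully allocated with at
- * least nr_replicas replicas (and uncompressed, if required) - if so, an
- * O_DIRECT overwrite can proceed without a new disk reservation:
- */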
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
-                                      u64 offset, u64 size,
-                                      unsigned nr_replicas, bool compressed)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 end = offset + size;
-       u32 snapshot;
-       bool ret = true;
-       int err;
-retry:
-       bch2_trans_begin(trans);
-
-       err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (err)
-               goto err;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
-                          SPOS(inum.inum, offset, snapshot),
-                          BTREE_ITER_slots, k, err) {
-               if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
-                       break;
-
-               if (k.k->p.snapshot != snapshot ||
-                   nr_replicas > bch2_bkey_replicas(c, k) ||
-                   (!compressed && bch2_bkey_sectors_compressed(k))) {
-                       ret = false;
-                       break;
-               }
-       }
-
-       offset = iter.pos.offset;
-       bch2_trans_iter_exit(trans, &iter);
-err:
-       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
-               goto retry;
-       bch2_trans_put(trans);
-
-       return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       return bch2_check_range_allocated(c, inode_inum(inode),
-                               dio->op.pos.offset, bio_sectors(bio),
-                               dio->op.opts.data_replicas,
-                               dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, and we're not guaranteed that it will live for the
- * duration of the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
-       struct iovec *iov = dio->inline_vecs;
-
-       /*
-        * ITER_UBUF iterators carry their single buffer inline - nothing to
-        * stash:
-        */
-       if (iter_is_ubuf(&dio->iter))
-               return 0;
-
-       /*
-        * We don't currently handle non-iovec iov_iters here - return an error,
-        * and we'll fall back to doing the IO synchronously:
-        */
-       if (!iter_is_iovec(&dio->iter))
-               return -1;
-
-       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
-               dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
-                                   GFP_KERNEL);
-               if (unlikely(!iov))
-                       return -ENOMEM;
-       }
-
-       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
-       dio->iter.__iov = iov;
-       return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
-{
-       closure_type(dio, struct dio_write, op.cl);
-       struct bch_fs *c = dio->op.c;
-
-       closure_debug_destroy(cl);
-
-       dio->op.error = bch2_journal_error(&c->journal);
-
-       bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_unpacked inode;
-       int ret;
-
-       dio->flush = 0;
-
-       closure_init(&dio->op.cl, NULL);
-
-       if (!dio->op.error) {
-               ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
-               if (ret) {
-                       dio->op.error = ret;
-               } else {
-                       bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
-                                                    &dio->op.cl);
-                       bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
-               }
-       }
-
-       if (dio->sync) {
-               closure_sync(&dio->op.cl);
-               closure_debug_destroy(&dio->op.cl);
-       } else {
-               continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
-       }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       bool sync = dio->sync;
-       long ret;
-
-       if (unlikely(dio->flush)) {
-               bch2_dio_write_flush(dio);
-               if (!sync)
-                       return -EIOCBQUEUED;
-       }
-
-       bch2_pagecache_block_put(inode);
-
-       kfree(dio->iov);
-
-       ret = dio->op.error ?: ((long) dio->written << 9);
-       bio_put(&dio->op.wbio.bio);
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
-
-       /* inode->i_dio_count is our ref on inode and thus bch_fs */
-       inode_dio_end(&inode->v);
-
-       if (ret < 0)
-               ret = bch2_err_class(ret);
-
-       if (!sync) {
-               req->ki_complete(req, ret);
-               ret = -EIOCBQUEUED;
-       }
-       return ret;
-}
-
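-/*
- * Per-iteration bookkeeping once a write op finishes: advance the file
- * position, update i_size for extending writes, and account sector/quota
- * deltas:
- */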
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       req->ki_pos     += (u64) dio->op.written << 9;
-       dio->written    += dio->op.written;
-
-       if (dio->extending) {
-               spin_lock(&inode->v.i_lock);
-               if (req->ki_pos > inode->v.i_size)
-                       i_size_write(&inode->v, req->ki_pos);
-               spin_unlock(&inode->v.i_lock);
-       }
-
-       if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
-               __bch2_quota_reservation_put(c, inode, &dio->quota_res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-
-       bio_release_pages(bio, false);
-
-       if (unlikely(dio->op.error))
-               set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct address_space *mapping = dio->mapping;
-       struct bch_inode_info *inode = dio->inode;
-       struct bch_io_opts opts;
-       struct bio *bio = &dio->op.wbio.bio;
-       unsigned unaligned, iter_count;
-       bool sync = dio->sync, dropped_locks;
-       long ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       while (1) {
-               iter_count = dio->iter.count;
-
-               EBUG_ON(current->faults_disabled_mapping);
-               current->faults_disabled_mapping = mapping;
-
-               ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
-               dropped_locks = fdm_dropped_locks();
-
-               current->faults_disabled_mapping = NULL;
-
-               /*
-                * If the fault handler returned an error but also signalled
-                * that it dropped & retook ei_pagecache_lock, we just need to
-                * re-shoot down the page cache and retry:
-                */
-               if (dropped_locks && ret)
-                       ret = 0;
-
-               if (unlikely(ret < 0))
-                       goto err;
-
-               if (unlikely(dropped_locks)) {
-                       ret = bch2_write_invalidate_inode_pages_range(mapping,
-                                       req->ki_pos,
-                                       req->ki_pos + iter_count - 1);
-                       if (unlikely(ret))
-                               goto err;
-
-                       if (!bio->bi_iter.bi_size)
-                               continue;
-               }
-
-               unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
-               bio->bi_iter.bi_size -= unaligned;
-               iov_iter_revert(&dio->iter, unaligned);
-
-               if (!bio->bi_iter.bi_size) {
-                       /*
-                        * bio_iov_iter_get_pages was only able to get <
-                        * blocksize worth of pages:
-                        */
-                       ret = -EFAULT;
-                       goto err;
-               }
-
-               bch2_write_op_init(&dio->op, c, opts);
-               dio->op.end_io          = sync
-                       ? NULL
-                       : bch2_dio_write_loop_async;
-               dio->op.target          = dio->op.opts.foreground_target;
-               dio->op.write_point     = writepoint_hashed((unsigned long) current);
-               dio->op.nr_replicas     = dio->op.opts.data_replicas;
-               dio->op.subvol          = inode->ei_inum.subvol;
-               dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
-               dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
-               if (sync)
-                       dio->op.flags |= BCH_WRITE_sync;
-               dio->op.flags |= BCH_WRITE_check_enospc;
-
-               ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
-                                                bio_sectors(bio), true);
-               if (unlikely(ret))
-                       goto err;
-
-               ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
-                                               dio->op.opts.data_replicas, 0);
-               if (unlikely(ret) &&
-                   !bch2_dio_write_check_allocated(dio))
-                       goto err;
-
-               task_io_account_write(bio->bi_iter.bi_size);
-
-               if (unlikely(dio->iter.count) &&
-                   !dio->sync &&
-                   !dio->loop &&
-                   bch2_dio_write_copy_iov(dio))
-                       dio->sync = sync = true;
-
-               dio->loop = true;
-               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
-               if (!sync)
-                       return -EIOCBQUEUED;
-
-               bch2_dio_write_end(dio);
-
-               if (likely(!dio->iter.count) || dio->op.error)
-                       break;
-
-               bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
-       }
-out:
-       return bch2_dio_write_done(dio);
-err:
-       dio->op.error = ret;
-
-       bio_release_pages(bio, false);
-
-       bch2_quota_reservation_put(c, inode, &dio->quota_res);
-       goto out;
-}
-
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
-{
-       struct mm_struct *mm = dio->mm;
-
-       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
-
-       if (mm)
-               kthread_use_mm(mm);
-       bch2_dio_write_loop(dio);
-       if (mm)
-               kthread_unuse_mm(mm);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
-       struct dio_write *dio = container_of(op, struct dio_write, op);
-
-       bch2_dio_write_end(dio);
-
-       if (likely(!dio->iter.count) || dio->op.error)
-               bch2_dio_write_done(dio);
-       else
-               bch2_dio_write_continue(dio);
-}
-
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
-{
-       struct file *file = req->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct dio_write *dio;
-       struct bio *bio;
-       bool locked = true, extending;
-       ssize_t ret;
-
-       prefetch(&c->opts);
-       prefetch((void *) &c->opts + 64);
-       prefetch(&inode->ei_inode);
-       prefetch((void *) &inode->ei_inode + 64);
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
-               return -EROFS;
-
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(req, iter);
-       if (unlikely(ret <= 0))
-               goto err_put_write_ref;
-
-       ret = file_remove_privs(file);
-       if (unlikely(ret))
-               goto err_put_write_ref;
-
-       ret = file_update_time(file);
-       if (unlikely(ret))
-               goto err_put_write_ref;
-
-       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
-               ret = -EINVAL;
-               goto err_put_write_ref;
-       }
-
-       inode_dio_begin(&inode->v);
-       bch2_pagecache_block_get(inode);
-
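-       /*
-        * Non-extending writes don't need the inode lock for the IO itself;
-        * extending writes keep it held to serialize i_size updates:
-        */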
-       extending = req->ki_pos + iter->count > inode->v.i_size;
-       if (!extending) {
-               inode_unlock(&inode->v);
-               locked = false;
-       }
-
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
-                              GFP_KERNEL,
-                              &c->dio_write_bioset);
-       dio = container_of(bio, struct dio_write, op.wbio.bio);
-       dio->req                = req;
-       dio->mapping            = mapping;
-       dio->inode              = inode;
-       dio->mm                 = current->mm;
-       dio->iov                = NULL;
-       dio->loop               = false;
-       dio->extending          = extending;
-       dio->sync               = is_sync_kiocb(req) || extending;
-       dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
-       dio->quota_res.sectors  = 0;
-       dio->written            = 0;
-       dio->iter               = *iter;
-       dio->op.c               = c;
-
-       if (unlikely(mapping->nrpages)) {
-               ret = bch2_write_invalidate_inode_pages_range(mapping,
-                                               req->ki_pos,
-                                               req->ki_pos + iter->count - 1);
-               if (unlikely(ret))
-                       goto err_put_bio;
-       }
-
-       ret = bch2_dio_write_loop(dio);
-out:
-       if (locked)
-               inode_unlock(&inode->v);
-       return ret;
-err_put_bio:
-       bch2_pagecache_block_put(inode);
-       bio_put(bio);
-       inode_dio_end(&inode->v);
-err_put_write_ref:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
-       goto out;
-}
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
-{
-       bioset_exit(&c->dio_write_bioset);
-       bioset_exit(&c->dio_read_bioset);
-}
-
-int bch2_fs_fs_io_direct_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->dio_read_bioset,
-                       4, offsetof(struct dio_read, rbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
-       if (bioset_init(&c->dio_write_bioset,
-                       4, offsetof(struct dio_write, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
-       return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
deleted file mode 100644 (file)
index 814621e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_DIRECT_H
-#define _BCACHEFS_FS_IO_DIRECT_H
-
-#ifndef NO_BCACHEFS_FS
-ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *);
-int bch2_fs_fs_io_direct_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
deleted file mode 100644 (file)
index c2cc405..0000000
+++ /dev/null
@@ -1,827 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "btree_iter.h"
-#include "extents.h"
-#include "fs-io.h"
-#include "fs-io-pagecache.h"
-#include "subvolume.h"
-
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
-                                    loff_t start, u64 end,
-                                    fgf_t fgp_flags, gfp_t gfp,
-                                    folios *fs)
-{
-       struct folio *f;
-       u64 pos = start;
-       int ret = 0;
-
-       while (pos < end) {
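-               /* Stop creating new folios more than 1MB past the start: */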
-               if ((u64) pos >= (u64) start + (1ULL << 20))
-                       fgp_flags &= ~FGP_CREAT;
-
-               ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
-               if (ret)
-                       break;
-
-               f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
-               if (IS_ERR(f))
-                       break;
-
-               BUG_ON(fs->nr && folio_pos(f) != pos);
-
-               pos = folio_end_pos(f);
-               darray_push(fs, f);
-       }
-
-       if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
-               ret = -ENOMEM;
-
-       return fs->nr ? 0 : ret;
-}
-
-/* pagecache_block must be held */
-int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
-                                           loff_t start, loff_t end)
-{
-       int ret;
-
-       /*
-        * XXX: the way this is currently implemented, we can spin if a process
-        * is continually redirtying a specific page
-        */
-       do {
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = filemap_write_and_wait_range(mapping, start, end);
-               if (ret)
-                       break;
-
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = invalidate_inode_pages2_range(mapping,
-                               start >> PAGE_SHIFT,
-                               end >> PAGE_SHIFT);
-       } while (ret == -EBUSY);
-
-       return ret;
-}
-
-#if 0
-/* Useful for debug tracing: */
-static const char * const bch2_folio_sector_states[] = {
-#define x(n)   #n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-       NULL
-};
-#endif
-
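-/*
- * Folio sector state transitions: dirtying maps unallocated -> dirty and
- * reserved -> dirty_reserved, undirtying is the inverse, and reserving maps
- * unallocated -> reserved and dirty -> dirty_reserved:
- */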
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_dirty;
-       case SECTOR_reserved:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_dirty:
-               return SECTOR_unallocated;
-       case SECTOR_dirty_reserved:
-               return SECTOR_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_reserved;
-       case SECTOR_dirty:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
-/* for newly allocated folios: */
-struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       struct bch_folio *s;
-
-       s = kzalloc(sizeof(*s) +
-                   sizeof(struct bch_folio_sector) *
-                   folio_sectors(folio), gfp);
-       if (!s)
-               return NULL;
-
-       spin_lock_init(&s->lock);
-       folio_attach_private(folio, s);
-       return s;
-}
-
-struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
-       if (bkey_extent_is_reservation(k))
-               return SECTOR_reserved;
-       if (bkey_extent_is_allocation(k.k))
-               return SECTOR_allocated;
-       return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
-                            unsigned pg_offset, unsigned pg_len,
-                            unsigned nr_ptrs, unsigned state)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       BUG_ON(pg_offset >= sectors);
-       BUG_ON(pg_offset + pg_len > sectors);
-
-       spin_lock(&s->lock);
-
-       for (i = pg_offset; i < pg_offset + pg_len; i++) {
-               s->s[i].nr_replicas     = nr_ptrs;
-               bch2_folio_sector_set(folio, s, i, state);
-       }
-
-       if (i == sectors)
-               s->uptodate = true;
-
-       spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-                  struct folio **fs, unsigned nr_folios)
-{
-       u64 offset = folio_sector(fs[0]);
-       bool need_set = false;
-
-       for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-               struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
-               if (!s)
-                       return -ENOMEM;
-
-               need_set |= !s->uptodate;
-       }
-
-       if (!need_set)
-               return 0;
-
-       unsigned folio_idx = 0;
-
-       return bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
-                                  POS(inum.inum, offset),
-                                  POS(inum.inum, U64_MAX),
-                                  inum.subvol, BTREE_ITER_slots, k, ({
-                       unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
-                       unsigned state = bkey_to_sector_state(k);
-
-                       while (folio_idx < nr_folios) {
-                               struct folio *folio = fs[folio_idx];
-                               u64 folio_start = folio_sector(folio);
-                               u64 folio_end   = folio_end_sector(folio);
-                               unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
-                                       folio_start;
-                               unsigned folio_len = min(k.k->p.offset, folio_end) -
-                                       folio_offset - folio_start;
-
-                               BUG_ON(k.k->p.offset < folio_start);
-                               BUG_ON(bkey_start_offset(k.k) > folio_end);
-
-                               if (!bch2_folio(folio)->uptodate)
-                                       __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
-                               if (k.k->p.offset < folio_end)
-                                       break;
-                               folio_idx++;
-                       }
-
-                       if (folio_idx == nr_folios)
-                               break;
-                       0;
-               })));
-}
-
-void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
-       struct bvec_iter iter;
-       struct folio_vec fv;
-       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
-       unsigned state = bkey_to_sector_state(k);
-
-       bio_for_each_folio(fv, bio, iter)
-               __bch2_folio_set(fv.fv_folio,
-                                fv.fv_offset >> 9,
-                                fv.fv_len >> 9,
-                                nr_ptrs, state);
-}
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
-                                    u64 start, u64 end)
-{
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       unsigned i, j;
-
-       if (end <= start)
-               return;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
-
-                       BUG_ON(end <= folio_start);
-
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++)
-                                       s->s[j].nr_replicas = 0;
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-}
-
-int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
-                                u64 *start, u64 end,
-                                bool nonblocking)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       s64 i_sectors_delta = 0;
-       int ret = 0;
-
-       if (end <= *start)
-               return 0;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-
-                       if (!nonblocking)
-                               folio_lock(folio);
-                       else if (!folio_trylock(folio)) {
-                               folio_batch_release(&fbatch);
-                               ret = -EAGAIN;
-                               break;
-                       }
-
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-
-                       BUG_ON(end <= folio_start);
-
-                       *start = min(end, folio_end);
-
-                       struct bch_folio *s = bch2_folio(folio);
-                       if (s) {
-                               unsigned folio_offset = max(*start, folio_start) - folio_start;
-                               unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-
-                               spin_lock(&s->lock);
-                               for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
-                                       i_sectors_delta -= s->s[j].state == SECTOR_dirty;
-                                       bch2_folio_sector_set(folio, s, j,
-                                               folio_sector_reserve(s->s[j].state));
-                               }
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-       return ret;
-}
-
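-/*
- * How many more replicas this sector needs reserved to reach the requested
- * replication level:
- */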
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
-                                         unsigned nr_replicas)
-{
-       return max(0, (int) nr_replicas -
-                  s->nr_replicas -
-                  s->replicas_reserved);
-}
-
-int bch2_get_folio_disk_reservation(struct bch_fs *c,
-                               struct bch_inode_info *inode,
-                               struct folio *folio, bool check_enospc)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned nr_replicas = inode_nr_replicas(c, inode);
-       struct disk_reservation disk_res = { 0 };
-       unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       for (i = 0; i < sectors; i++)
-               disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
-       if (!disk_res_sectors)
-               return 0;
-
-       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       disk_res_sectors, 1,
-                                       !check_enospc
-                                       ? BCH_DISK_RESERVATION_NOFAIL
-                                       : 0);
-       if (unlikely(ret))
-               return ret;
-
-       for (i = 0; i < sectors; i++)
-               s->s[i].replicas_reserved +=
-                       sectors_to_reserve(&s->s[i], nr_replicas);
-
-       return 0;
-}
-
-void bch2_folio_reservation_put(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       bch2_disk_reservation_put(c, &res->disk);
-       bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int __bch2_folio_reservation_get(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       size_t offset, size_t len,
-                       bool partial)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned i, disk_sectors = 0, quota_sectors = 0;
-       struct disk_reservation disk_res = {};
-       size_t reserved = len;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       BUG_ON(!s->uptodate);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
-               quota_sectors += s->s[i].state == SECTOR_unallocated;
-       }
-
-       if (disk_sectors) {
-               ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
-                               partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
-               if (unlikely(ret))
-                       return ret;
-
-               if (unlikely(disk_res.sectors != disk_sectors)) {
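-                       /*
-                        * Partial reservation: walk the range again to see
-                        * how far the sectors we did get will take us:
-                        */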
-                       disk_sectors = quota_sectors = 0;
-
-                       for (i = round_down(offset, block_bytes(c)) >> 9;
-                            i < round_up(offset + len, block_bytes(c)) >> 9;
-                            i++) {
-                               disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
-                               if (disk_sectors > disk_res.sectors) {
-                                       /*
-                                        * Make sure to get a reservation that's
-                                        * aligned to the filesystem blocksize:
-                                        */
-                                       unsigned reserved_offset = round_down(i << 9, block_bytes(c));
-                                       reserved = clamp(reserved_offset, offset, offset + len) - offset;
-
-                                       if (!reserved) {
-                                               bch2_disk_reservation_put(c, &disk_res);
-                                               return bch_err_throw(c, ENOSPC_disk_reservation);
-                                       }
-                                       break;
-                               }
-                               quota_sectors += s->s[i].state == SECTOR_unallocated;
-                       }
-               }
-       }
-
-       if (quota_sectors) {
-               ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
-               if (unlikely(ret)) {
-                       bch2_disk_reservation_put(c, &disk_res);
-                       return ret;
-               }
-       }
-
-       res->disk.sectors += disk_res.sectors;
-       return partial ? reserved : 0;
-}
-
-int bch2_folio_reservation_get(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       size_t offset, size_t len)
-{
-       return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
-}
-
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       size_t offset, size_t len)
-{
-       return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_folio *s = bch2_folio(folio);
-       struct disk_reservation disk_res = { 0 };
-       int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
-       if (!s)
-               return;
-
-       EBUG_ON(!folio_test_locked(folio));
-       EBUG_ON(folio_test_writeback(folio));
-
-       for (i = 0; i < sectors; i++) {
-               disk_res.sectors += s->s[i].replicas_reserved;
-               s->s[i].replicas_reserved = 0;
-
-               dirty_sectors -= s->s[i].state == SECTOR_dirty;
-               bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
-       }
-
-       bch2_disk_reservation_put(c, &disk_res);
-
-       bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
-
-       bch2_folio_release(folio);
-}
-
-void bch2_set_folio_dirty(struct bch_fs *c,
-                         struct bch_inode_info *inode,
-                         struct folio *folio,
-                         struct bch2_folio_reservation *res,
-                         unsigned offset, unsigned len)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, dirty_sectors = 0;
-
-       WARN_ON((u64) folio_pos(folio) + offset + len >
-               round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
-       BUG_ON(!s->uptodate);
-
-       spin_lock(&s->lock);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               unsigned sectors = sectors_to_reserve(&s->s[i],
-                                               res->disk.nr_replicas);
-
-               /*
-                * This can happen if we race with the error path in
-                * bch2_writepage_io_done():
-                */
-               sectors = min_t(unsigned, sectors, res->disk.sectors);
-
-               s->s[i].replicas_reserved += sectors;
-               res->disk.sectors -= sectors;
-
-               dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
-               bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
-       }
-
-       spin_unlock(&s->lock);
-
-       bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
-       if (!folio_test_dirty(folio))
-               filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
-       struct file *file = vmf->vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct address_space *fdm = faults_disabled_mapping();
-       struct bch_inode_info *inode = file_bch_inode(file);
-       vm_fault_t ret;
-
-       if (fdm == mapping)
-               return VM_FAULT_SIGBUS;
-
-       /*
-        * Lock ordering: pagecache locks are ordered by mapping address. If
-        * the faults-disabled mapping (locked by the dio write path) sorts
-        * after this one, we can't block on pagecache_add here - drop the
-        * write path's pagecache_block lock, cycle pagecache_add on this
-        * inode, retake pagecache_block, and signal that locks were dropped
-        * so the write path retries:
-        */
-       if (fdm > mapping) {
-               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
-               if (bch2_pagecache_add_tryget(inode))
-                       goto got_lock;
-
-               bch2_pagecache_block_put(fdm_host);
-
-               bch2_pagecache_add_get(inode);
-               bch2_pagecache_add_put(inode);
-
-               bch2_pagecache_block_get(fdm_host);
-
-               /* Signal that lock has been dropped: */
-               set_fdm_dropped_locks();
-               return VM_FAULT_SIGBUS;
-       }
-
-       bch2_pagecache_add_get(inode);
-got_lock:
-       ret = filemap_fault(vmf);
-       bch2_pagecache_add_put(inode);
-
-       return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
-       struct folio *folio = page_folio(vmf->page);
-       struct file *file = vmf->vma->vm_file;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       vm_fault_t ret;
-
-       loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
-       unsigned offset = file_offset - folio_pos(folio);
-       unsigned len = max(PAGE_SIZE, block_bytes(c));
-
-       BUG_ON(offset + len > folio_size(folio));
-
-       bch2_folio_reservation_init(c, inode, &res);
-
-       sb_start_pagefault(inode->v.i_sb);
-       file_update_time(file);
-
-       /*
-        * Not strictly necessary, but this helps avoid dio writes livelocking
-        * in bch2_write_invalidate_inode_pages_range() - we can drop it
-        * if/when we get a bch2_write_invalidate_inode_pages_range() that
-        * works without dropping the page lock before invalidating the page:
-        */
-       bch2_pagecache_add_get(inode);
-
-       folio_lock(folio);
-       u64 isize = i_size_read(&inode->v);
-
-       if (folio->mapping != mapping || file_offset >= isize) {
-               folio_unlock(folio);
-               ret = VM_FAULT_NOPAGE;
-               goto out;
-       }
-
-       len = min_t(unsigned, len, isize - file_offset);
-
-       if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
-           bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
-               folio_unlock(folio);
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
-
-       bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       folio_wait_stable(folio);
-       ret = VM_FAULT_LOCKED;
-out:
-       bch2_pagecache_add_put(inode);
-       sb_end_pagefault(inode->v.i_sb);
-
-       return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
-       if (offset || length < folio_size(folio))
-               return;
-
-       bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
-       if (folio_test_dirty(folio) || folio_test_writeback(folio))
-               return false;
-
-       bch2_clear_folio_bits(folio);
-       return true;
-}
-
-/* fseek: */
-
-static int folio_data_offset(struct folio *folio, loff_t pos,
-                            unsigned min_replicas)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       if (s)
-               for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
-                       if (s->s[i].state >= SECTOR_dirty &&
-                           s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
-                               return i << SECTOR_SHIFT;
-
-       return -1;
-}
-
-loff_t bch2_seek_pagecache_data(struct inode *vinode,
-                               loff_t start_offset,
-                               loff_t end_offset,
-                               unsigned min_replicas,
-                               bool nonblock)
-{
-       struct folio_batch fbatch;
-       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
-       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
-       pgoff_t index           = start_index;
-       unsigned i;
-       loff_t ret;
-       int offset;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(vinode->i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-
-                       if (!nonblock) {
-                               folio_lock(folio);
-                       } else if (!folio_trylock(folio)) {
-                               folio_batch_release(&fbatch);
-                               return -EAGAIN;
-                       }
-
-                       offset = folio_data_offset(folio,
-                                       max(folio_pos(folio), start_offset),
-                                       min_replicas);
-                       if (offset >= 0) {
-                               ret = clamp(folio_pos(folio) + offset,
-                                           start_offset, end_offset);
-                               folio_unlock(folio);
-                               folio_batch_release(&fbatch);
-                               return ret;
-                       }
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       return end_offset;
-}
-
-/*
- * Search for a hole in a folio.
- *
- * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
- * code to indicate a pagecache hole exists at the returned offset. Otherwise
- * return 0 if the folio is filled with data, or an error code. This function
- * can return -EAGAIN if nonblock is specified.
- */
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
-                             unsigned min_replicas, bool nonblock)
-{
-       struct folio *folio;
-       struct bch_folio *s;
-       unsigned i, sectors;
-       int ret = -ENOENT;
-
-       folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
-                                   FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
-       if (IS_ERR(folio))
-               return PTR_ERR(folio);
-
-       s = bch2_folio(folio);
-       if (!s)
-               goto unlock;
-
-       sectors = folio_sectors(folio);
-       for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
-               if (s->s[i].state < SECTOR_dirty ||
-                   s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
-                       *offset = max(*offset,
-                                     folio_pos(folio) + (i << SECTOR_SHIFT));
-                       goto unlock;
-               }
-
-       *offset = folio_end_pos(folio);
-       ret = 0;
-unlock:
-       folio_unlock(folio);
-       folio_put(folio);
-       return ret;
-}
-
-loff_t bch2_seek_pagecache_hole(struct inode *vinode,
-                               loff_t start_offset,
-                               loff_t end_offset,
-                               unsigned min_replicas,
-                               bool nonblock)
-{
-       struct address_space *mapping = vinode->i_mapping;
-       loff_t offset = start_offset;
-       loff_t ret = 0;
-
-       while (!ret && offset < end_offset)
-               ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
-
-       if (ret && ret != -ENOENT)
-               return ret;
-       return min(offset, end_offset);
-}
-
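-/*
- * Narrow a candidate hole (in 512 byte sectors) against the pagecache:
- * advance the start past any pagecache data, then clamp the end to the next
- * offset at which pagecache data begins:
- */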
-int bch2_clamp_data_hole(struct inode *inode,
-                        u64 *hole_start,
-                        u64 *hole_end,
-                        unsigned min_replicas,
-                        bool nonblock)
-{
-       loff_t ret;
-
-       ret = bch2_seek_pagecache_hole(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_start = ret;
-
-       if (*hole_start == *hole_end)
-               return 0;
-
-       ret = bch2_seek_pagecache_data(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_end = ret;
-       return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
deleted file mode 100644 (file)
index fad911c..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
-#define _BCACHEFS_FS_IO_PAGECACHE_H
-
-#include <linux/pagemap.h>
-
-typedef DARRAY(struct folio *) folios;
-
-int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
-                                    u64, fgf_t, gfp_t, folios *);
-int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
-       return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
-       return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
-       return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
-       return folio_end_pos(folio) >> 9;
-}
-
-#define BCH_FOLIO_SECTOR_STATE()       \
-       x(unallocated)                  \
-       x(reserved)                     \
-       x(dirty)                        \
-       x(dirty_reserved)               \
-       x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n)   SECTOR_##n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-struct bch_folio_sector {
-       /* Uncompressed, fully allocated replicas (or on disk reservation): */
-       u8                      nr_replicas:4,
-       /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
-                               replicas_reserved:4;
-       u8                      state;
-};
-
-struct bch_folio {
-       spinlock_t              lock;
-       atomic_t                write_count;
-       /*
-        * Is the sector state up to date with the btree?
-        * (Not the data itself)
-        */
-       bool                    uptodate;
-       struct bch_folio_sector s[];
-};
-
-/* Helper for when we need to add debug instrumentation: */
-static inline void bch2_folio_sector_set(struct folio *folio,
-                            struct bch_folio *s,
-                            unsigned i, unsigned n)
-{
-       s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
-       u64 f_offset = pos - folio_pos(folio);
-
-       BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
-       return f_offset >> SECTOR_SHIFT;
-}
-
-/* for newly allocated folios: */
-static inline void __bch2_folio_release(struct folio *folio)
-{
-       kfree(folio_detach_private(folio));
-}
-
-static inline void bch2_folio_release(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-       __bch2_folio_release(folio);
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
-       return folio_get_private(folio);
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-
-       return __bch2_folio(folio);
-}
-
-struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
-struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
-
-struct bch2_folio_reservation {
-       struct disk_reservation disk;
-       struct quota_res        quota;
-};
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       /* XXX: this should not be open coded */
-       return inode->ei_inode.bi_data_replicas
-               ? inode->ei_inode.bi_data_replicas - 1
-               : c->opts.data_replicas;
-}
-
-static inline void bch2_folio_reservation_init(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       memset(res, 0, sizeof(*res));
-
-       res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
-void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
-
-int bch2_get_folio_disk_reservation(struct bch_fs *,
-                               struct bch_inode_info *,
-                               struct folio *, bool);
-
-void bch2_folio_reservation_put(struct bch_fs *,
-                       struct bch_inode_info *,
-                       struct bch2_folio_reservation *);
-int bch2_folio_reservation_get(struct bch_fs *,
-                       struct bch_inode_info *,
-                       struct folio *,
-                       struct bch2_folio_reservation *,
-                       size_t, size_t);
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
-                       struct bch_inode_info *,
-                       struct folio *,
-                       struct bch2_folio_reservation *,
-                       size_t, size_t);
-
-void bch2_set_folio_dirty(struct bch_fs *,
-                         struct bch_inode_info *,
-                         struct folio *,
-                         struct bch2_folio_reservation *,
-                         unsigned, unsigned);
-
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
-loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
-loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
-int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
deleted file mode 100644 (file)
index a233f45..0000000
+++ /dev/null
@@ -1,1102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "quota.h"
-#include "reflink.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/rmap.h>
-#include <linux/sched/signal.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-
-#include <trace/events/writeback.h>
-
-struct nocow_flush {
-       struct closure  *cl;
-       struct bch_dev  *ca;
-       struct bio      bio;
-};
-
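-/*
- * Endio for the nocow flush bios issued below: drop the closure ref taken
- * at submission and the per-device write ref, then free the bio.
- */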
-static void nocow_flush_endio(struct bio *_bio)
-{
-       struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
-       closure_put(bio->cl);
-       enumerated_ref_put(&bio->ca->io_ref[WRITE],
-                          BCH_DEV_WRITE_REF_nocow_flush);
-       bio_put(&bio->bio);
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct closure *cl)
-{
-       struct nocow_flush *bio;
-       struct bch_dev *ca;
-       struct bch_devs_mask devs;
-       unsigned dev;
-
-       dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
-       if (dev == BCH_SB_MEMBERS_MAX)
-               return;
-
-       devs = inode->ei_devs_need_flush;
-       memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
-       for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
-               scoped_guard(rcu) {
-                       ca = rcu_dereference(c->devs[dev]);
-                       if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
-                                                        BCH_DEV_WRITE_REF_nocow_flush))
-                               ca = NULL;
-               }
-
-               if (!ca)
-                       continue;
-
-               bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-                                                   REQ_OP_WRITE|REQ_PREFLUSH,
-                                                   GFP_KERNEL,
-                                                   &c->nocow_flush_bioset),
-                                  struct nocow_flush, bio);
-               bio->cl                 = cl;
-               bio->ca                 = ca;
-               bio->bio.bi_end_io      = nocow_flush_endio;
-               closure_bio_submit(&bio->bio, cl);
-       }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
-                                        struct bch_inode_info *inode)
-{
-       struct closure cl;
-
-       closure_init_stack(&cl);
-       bch2_inode_flush_nocow_writes_async(c, inode, &cl);
-       closure_sync(&cl);
-
-       return 0;
-}
-
-/* i_size updates: */
-
-struct inode_new_size {
-       loff_t          new_size;
-       u64             now;
-       unsigned        fields;
-};
-
-static int inode_set_size(struct btree_trans *trans,
-                         struct bch_inode_info *inode,
-                         struct bch_inode_unpacked *bi,
-                         void *p)
-{
-       struct inode_new_size *s = p;
-
-       bi->bi_size = s->new_size;
-       if (s->fields & ATTR_ATIME)
-               bi->bi_atime = s->now;
-       if (s->fields & ATTR_MTIME)
-               bi->bi_mtime = s->now;
-       if (s->fields & ATTR_CTIME)
-               bi->bi_ctime = s->now;
-
-       return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      loff_t new_size, unsigned fields)
-{
-       struct inode_new_size s = {
-               .new_size       = new_size,
-               .now            = bch2_current_time(c),
-               .fields         = fields,
-       };
-
-       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
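-/*
- * Update the in-memory i_blocks count, consuming from @quota_res when one
- * is supplied; an i_blocks underflow is reported as a fsck error and
- * clamped rather than allowed to wrap.
- */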
-void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, s64 sectors)
-{
-       if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
-                          inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
-                          inode->ei_inode.bi_sectors);
-
-               bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
-               if (print)
-                       bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-
-               if (sectors < 0)
-                       sectors = -inode->v.i_blocks;
-               else
-                       sectors = 0;
-       }
-
-       inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-       if (quota_res &&
-           !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
-           sectors > 0) {
-               BUG_ON(sectors > quota_res->sectors);
-               BUG_ON(sectors > inode->ei_quota_reserved);
-
-               quota_res->sectors -= sectors;
-               inode->ei_quota_reserved -= sectors;
-       } else {
-               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
-       }
-#endif
-}
-
-/* fsync: */
-
-static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
-                                           u64 *seq)
-{
-       struct printbuf buf = PRINTBUF;
-       struct bch_inode_unpacked u;
-       struct btree_iter iter;
-       int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
-       if (ret)
-               return ret;
-
-       u64 cur_seq = journal_cur_seq(&trans->c->journal);
-       *seq = min(cur_seq, u.bi_journal_seq);
-
-       if (fsck_err_on(u.bi_journal_seq > cur_seq,
-                       trans, inode_journal_seq_in_future,
-                       "inode journal seq in future (currently at %llu)\n%s",
-                       cur_seq,
-                       (bch2_inode_unpacked_to_text(&buf, &u),
-                       buf.buf))) {
-               u.bi_journal_seq = cur_seq;
-               ret = bch2_inode_write(trans, &iter, &u);
-       }
-fsck_err:
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
- * insert trigger: look up the btree inode instead
- */
-static int bch2_flush_inode(struct bch_fs *c,
-                           struct bch_inode_info *inode)
-{
-       if (c->opts.journal_flush_disabled)
-               return 0;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
-               return -EROFS;
-
-       u64 seq;
-       int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-                       bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
-                 bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
-                 bch2_inode_flush_nocow_writes(c, inode);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
-       return ret;
-}
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret, err;
-
-       trace_bch2_fsync(file, datasync);
-
-       ret = file_write_and_wait_range(file, start, end);
-       if (ret)
-               goto out;
-       ret = sync_inode_metadata(&inode->v, 1);
-       if (ret)
-               goto out;
-       ret = bch2_flush_inode(c, inode);
-out:
-       ret = bch2_err_class(ret);
-       if (ret == -EROFS)
-               ret = -EIO;
-
-       err = file_check_and_advance_wb_err(file);
-       if (!ret)
-               ret = err;
-
-       return ret;
-}
-
-/* truncate: */
-
-static inline int range_has_data(struct bch_fs *c, u32 subvol,
-                                struct bpos start,
-                                struct bpos end)
-{
-       return bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
-                                                   subvol, 0, k, ({
-                       bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
-               })));
-}
-
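-/*
- * Zero the part of the folio that intersects [start, end). Returns > 0 if
- * the folio still has dirty sectors at its tail, i.e. writeback will write
- * it out and handle any i_size update; 0 if the caller is responsible for
- * the i_size update itself.
- */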
-static int __bch2_truncate_folio(struct bch_inode_info *inode,
-                                pgoff_t index, loff_t start, loff_t end)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct address_space *mapping = inode->v.i_mapping;
-       struct bch_folio *s;
-       unsigned start_offset;
-       unsigned end_offset;
-       unsigned i;
-       struct folio *folio;
-       s64 i_sectors_delta = 0;
-       int ret = 0;
-       u64 end_pos;
-
-       folio = filemap_lock_folio(mapping, index);
-       if (IS_ERR_OR_NULL(folio)) {
-               /*
-                * XXX: we're doing two index lookups when we end up reading the
-                * folio
-                */
-               ret = range_has_data(c, inode->ei_inum.subvol,
-                               POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
-                               POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
-               if (ret <= 0)
-                       return ret;
-
-               folio = __filemap_get_folio(mapping, index,
-                                           FGP_LOCK|FGP_CREAT, GFP_KERNEL);
-               if (IS_ERR(folio)) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-       }
-
-       BUG_ON(start    >= folio_end_pos(folio));
-       BUG_ON(end      <= folio_pos(folio));
-
-       start_offset    = max(start, folio_pos(folio)) - folio_pos(folio);
-       end_offset      = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
-
-       /* Folio boundary? Nothing to do */
-       if (start_offset == 0 &&
-           end_offset == folio_size(folio)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       s = bch2_folio_create(folio, 0);
-       if (!s) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
-
-       if (!folio_test_uptodate(folio)) {
-               ret = bch2_read_single_folio(folio, mapping);
-               if (ret)
-                       goto unlock;
-       }
-
-       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-       if (ret)
-               goto unlock;
-
-       for (i = round_up(start_offset, block_bytes(c)) >> 9;
-            i < round_down(end_offset, block_bytes(c)) >> 9;
-            i++) {
-               s->s[i].nr_replicas     = 0;
-
-               i_sectors_delta -= s->s[i].state == SECTOR_dirty;
-               bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
-       }
-
-       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-       /*
-        * Caller needs to know whether this folio will be written out by
-        * writeback - doing an i_size update if necessary - or whether the
-        * caller will be responsible for the i_size update.
-        *
-        * Note that we shouldn't ever see a folio beyond EOF, but check and
-        * warn if so. This has been observed due to a failure to clean up
-        * folios after a short write, and there's still a chance reclaim will
-        * fix things up.
-        */
-       WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
-       end_pos = folio_end_pos(folio);
-       if (inode->v.i_size > folio_pos(folio))
-               end_pos = min_t(u64, inode->v.i_size, end_pos);
-       ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
-       folio_zero_segment(folio, start_offset, end_offset);
-
-       /*
-        * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
-        *
-        * XXX: because we aren't currently tracking whether the folio has actual
-        * data in it (vs. just 0s, or only partially written) this is wrong. Ick.
-        */
-       BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
-       /*
-        * This removes any writeable userspace mappings; we need to force
-        * .page_mkwrite to be called again before any mmapped writes, to
-        * redirty the full page:
-        */
-       folio_mkclean(folio);
-       filemap_dirty_folio(mapping, folio);
-unlock:
-       folio_unlock(folio);
-       folio_put(folio);
-out:
-       return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
-       return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
-                                    from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
-                               loff_t start, loff_t end)
-{
-       int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
-                                       start, end);
-
-       if (ret >= 0 &&
-           start >> PAGE_SHIFT != end >> PAGE_SHIFT)
-               ret = __bch2_truncate_folio(inode,
-                                       (end - 1) >> PAGE_SHIFT,
-                                       start, end);
-       return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
-                      struct bch_inode_info *inode,
-                      struct bch_inode_unpacked *inode_u,
-                      struct iattr *iattr)
-{
-       struct address_space *mapping = inode->v.i_mapping;
-       int ret;
-
-       /*
-        * sync appends:
-        *
-        * this has to be done _before_ extending i_size:
-        */
-       ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
-       if (ret)
-               return ret;
-
-       truncate_setsize(&inode->v, iattr->ia_size);
-
-       return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
-                 struct bch_inode_info *inode, struct iattr *iattr)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct address_space *mapping = inode->v.i_mapping;
-       struct bch_inode_unpacked inode_u;
-       s64 i_sectors_delta = 0;
-       int ret = 0;
-
-       /*
-        * If the truncate call will change the size of the file, the
-        * cmtimes should be updated. If the size will not change, we
-        * do not need to update the cmtimes.
-        */
-       if (iattr->ia_size != inode->v.i_size) {
-               if (!(iattr->ia_valid & ATTR_MTIME))
-                       ktime_get_coarse_real_ts64(&iattr->ia_mtime);
-               if (!(iattr->ia_valid & ATTR_CTIME))
-                       ktime_get_coarse_real_ts64(&iattr->ia_ctime);
-               iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
-       }
-
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(inode);
-
-       ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
-       if (ret)
-               goto err;
-
-       /*
-        * check this before the next assertion; on filesystem error our normal
-        * invariants are a bit broken (truncate has to truncate the page cache
-        * before the inode).
-        */
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               goto err;
-
-       WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
-                 inode->v.i_size < inode_u.bi_size,
-                 "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
-                 (u64) inode->v.i_size, inode_u.bi_size);
-
-       if (iattr->ia_size > inode->v.i_size) {
-               ret = bch2_extend(idmap, inode, &inode_u, iattr);
-               goto err;
-       }
-
-       iattr->ia_valid &= ~ATTR_SIZE;
-
-       ret = bch2_truncate_folio(inode, iattr->ia_size);
-       if (unlikely(ret < 0))
-               goto err;
-       ret = 0;
-
-       truncate_setsize(&inode->v, iattr->ia_size);
-
-       /*
-        * When extending, we're going to write the new i_size to disk
-        * immediately so we need to flush anything above the current on disk
-        * i_size first:
-        *
-        * Also, when extending we need to flush the page that i_size currently
-        * straddles - if it's mapped to userspace, we need to ensure that
-        * userspace has to redirty it and call .mkwrite -> set_page_dirty
-        * again to allocate the part of the page that was extended.
-        */
-       if (iattr->ia_size > inode_u.bi_size)
-               ret = filemap_write_and_wait_range(mapping,
-                               inode_u.bi_size,
-                               iattr->ia_size - 1);
-       else if (iattr->ia_size & (PAGE_SIZE - 1))
-               ret = filemap_write_and_wait_range(mapping,
-                               round_down(iattr->ia_size, PAGE_SIZE),
-                               iattr->ia_size - 1);
-       if (ret)
-               goto err;
-
-       ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
-       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-       if (unlikely(ret)) {
-               /*
-                * If we error here, the VFS caches are now inconsistent with the btree
-                */
-               set_bit(EI_INODE_ERROR, &inode->ei_flags);
-               goto err;
-       }
-
-       if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
-                    !bch2_journal_error(&c->journal))) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-               prt_printf(&buf,
-                          "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
-                          inode->v.i_ino, (u64) inode->v.i_blocks,
-                          inode->ei_inode.bi_sectors);
-
-               bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
-               if (print)
-                       bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
-       bch2_pagecache_block_put(inode);
-       return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
-                                struct bch_inode_info *inode,
-                                struct bch_inode_unpacked *bi, void *p)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
-       return 0;
-}
-
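-/*
- * FALLOC_FL_PUNCH_HOLE: zero any partial blocks at the edges via the
- * truncate-folio helpers, drop the page cache for the range, then punch the
- * block-aligned middle out of the extents btree.
- */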
-static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       u64 end         = offset + len;
-       u64 block_start = round_up(offset, block_bytes(c));
-       u64 block_end   = round_down(end, block_bytes(c));
-       bool truncated_last_page;
-       int ret = 0;
-
-       ret = bch2_truncate_folios(inode, offset, end);
-       if (unlikely(ret < 0))
-               goto err;
-
-       truncated_last_page = ret;
-
-       truncate_pagecache_range(&inode->v, offset, end - 1);
-
-       if (block_start < block_end) {
-               s64 i_sectors_delta = 0;
-
-               ret = bch2_fpunch(c, inode_inum(inode),
-                                 block_start >> 9, block_end >> 9,
-                                 &i_sectors_delta);
-               bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-       }
-
-       mutex_lock(&inode->ei_update_lock);
-       if (end >= inode->v.i_size && !truncated_last_page) {
-               ret = bch2_write_inode_size(c, inode, inode->v.i_size,
-                                           ATTR_MTIME|ATTR_CTIME);
-       } else {
-               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                                      ATTR_MTIME|ATTR_CTIME);
-       }
-       mutex_unlock(&inode->ei_update_lock);
-err:
-       return ret;
-}
-
-static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
-                                  loff_t offset, loff_t len,
-                                  bool insert)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct address_space *mapping = inode->v.i_mapping;
-       s64 i_sectors_delta = 0;
-       int ret = 0;
-
-       if ((offset | len) & (block_bytes(c) - 1))
-               return -EINVAL;
-
-       if (insert) {
-               if (offset >= inode->v.i_size)
-                       return -EINVAL;
-       } else {
-               if (offset + len >= inode->v.i_size)
-                       return -EINVAL;
-       }
-
-       ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
-       if (ret)
-               return ret;
-
-       if (insert)
-               i_size_write(&inode->v, inode->v.i_size + len);
-
-       ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
-                                    insert, &i_sectors_delta);
-       if (!ret && !insert)
-               i_size_write(&inode->v, inode->v.i_size - len);
-       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-       return ret;
-}
-
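-/*
- * Core fallocate loop: walk extent slots in [start_sector, end_sector),
- * skip anything already allocated or sufficiently reserved, and allocate or
- * reserve the holes; page-cache clamping is redone with btree locks dropped
- * when it would otherwise block.
- */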
-static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
-                            u64 start_sector, u64 end_sector)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bpos end_pos = POS(inode->v.i_ino, end_sector);
-       struct bch_io_opts opts;
-       int ret = 0;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                       POS(inode->v.i_ino, start_sector),
-                       BTREE_ITER_slots|BTREE_ITER_intent);
-
-       while (!ret) {
-               s64 i_sectors_delta = 0;
-               struct quota_res quota_res = { 0 };
-               struct bkey_s_c k;
-               unsigned sectors;
-               bool is_allocation;
-               u64 hole_start, hole_end;
-               u32 snapshot;
-
-               bch2_trans_begin(trans);
-
-               if (bkey_ge(iter.pos, end_pos))
-                       break;
-
-               ret = bch2_subvolume_get_snapshot(trans,
-                                       inode->ei_inum.subvol, &snapshot);
-               if (ret)
-                       goto bkey_err;
-
-               bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               if ((ret = bkey_err(k)))
-                       goto bkey_err;
-
-               hole_start      = iter.pos.offset;
-               hole_end        = bpos_min(k.k->p, end_pos).offset;
-               is_allocation   = bkey_extent_is_allocation(k.k);
-
-               /* already reserved */
-               if (bkey_extent_is_reservation(k) &&
-                   bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
-                       bch2_btree_iter_advance(trans, &iter);
-                       continue;
-               }
-
-               if (bkey_extent_is_data(k.k) &&
-                   !(mode & FALLOC_FL_ZERO_RANGE)) {
-                       bch2_btree_iter_advance(trans, &iter);
-                       continue;
-               }
-
-               if (!(mode & FALLOC_FL_ZERO_RANGE)) {
-                       /*
-                        * Lock ordering - can't be holding btree locks while
-                        * blocking on a folio lock:
-                        */
-                       if (bch2_clamp_data_hole(&inode->v,
-                                                &hole_start,
-                                                &hole_end,
-                                                opts.data_replicas, true)) {
-                               ret = drop_locks_do(trans,
-                                       (bch2_clamp_data_hole(&inode->v,
-                                                             &hole_start,
-                                                             &hole_end,
-                                                             opts.data_replicas, false), 0));
-                               if (ret)
-                                       goto bkey_err;
-                       }
-                       bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
-
-                       if (ret)
-                               goto bkey_err;
-
-                       if (hole_start == hole_end)
-                               continue;
-               }
-
-               sectors = hole_end - hole_start;
-
-               if (!is_allocation) {
-                       ret = bch2_quota_reservation_add(c, inode,
-                                       &quota_res, sectors, true);
-                       if (unlikely(ret))
-                               goto bkey_err;
-               }
-
-               ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
-                                           sectors, opts, &i_sectors_delta,
-                                           writepoint_hashed((unsigned long) current));
-               if (ret)
-                       goto bkey_err;
-
-               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-
-               if (bch2_mark_pagecache_reserved(inode, &hole_start,
-                                                iter.pos.offset, true)) {
-                       ret = drop_locks_do(trans,
-                               bch2_mark_pagecache_reserved(inode, &hole_start,
-                                                            iter.pos.offset, false));
-                       if (ret)
-                               goto bkey_err;
-               }
-bkey_err:
-               bch2_quota_reservation_put(c, inode, &quota_res);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-       }
-
-       if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
-               struct quota_res quota_res = { 0 };
-               s64 i_sectors_delta = 0;
-
-               bch2_fpunch_at(trans, &iter, inode_inum(inode),
-                              end_sector, &i_sectors_delta);
-               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-               bch2_quota_reservation_put(c, inode, &quota_res);
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
-                           loff_t offset, loff_t len)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       u64 end         = offset + len;
-       u64 block_start = round_down(offset,    block_bytes(c));
-       u64 block_end   = round_up(end,         block_bytes(c));
-       bool truncated_last_page = false;
-       int ret, ret2 = 0;
-
-       if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
-               ret = inode_newsize_ok(&inode->v, end);
-               if (ret)
-                       return ret;
-       }
-
-       if (mode & FALLOC_FL_ZERO_RANGE) {
-               ret = bch2_truncate_folios(inode, offset, end);
-               if (unlikely(ret < 0))
-                       return ret;
-
-               truncated_last_page = ret;
-
-               truncate_pagecache_range(&inode->v, offset, end - 1);
-
-               block_start     = round_up(offset,      block_bytes(c));
-               block_end       = round_down(end,       block_bytes(c));
-       }
-
-       ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-
-       /*
-        * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
-        * so that the VFS cache i_size is consistent with the btree i_size:
-        */
-       if (ret &&
-           !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
-               return ret;
-
-       if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
-               end = inode->v.i_size;
-
-       if (end >= inode->v.i_size &&
-           (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
-            !(mode & FALLOC_FL_KEEP_SIZE))) {
-               spin_lock(&inode->v.i_lock);
-               i_size_write(&inode->v, end);
-               spin_unlock(&inode->v.i_lock);
-
-               mutex_lock(&inode->ei_update_lock);
-               ret2 = bch2_write_inode_size(c, inode, end, 0);
-               mutex_unlock(&inode->ei_update_lock);
-       }
-
-       return ret ?: ret2;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
-                            loff_t offset, loff_t len)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       long ret;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
-               return -EROFS;
-
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(inode);
-
-       ret = file_modified(file);
-       if (ret)
-               goto err;
-
-       if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
-               ret = bchfs_fallocate(inode, mode, offset, len);
-       else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
-               ret = bchfs_fpunch(inode, offset, len);
-       else if (mode == FALLOC_FL_INSERT_RANGE)
-               ret = bchfs_fcollapse_finsert(inode, offset, len, true);
-       else if (mode == FALLOC_FL_COLLAPSE_RANGE)
-               ret = bchfs_fcollapse_finsert(inode, offset, len, false);
-       else
-               ret = -EOPNOTSUPP;
-err:
-       bch2_pagecache_block_put(inode);
-       inode_unlock(&inode->v);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
-
-       return bch2_err_class(ret);
-}
-
-/*
- * Take a quota reservation for unallocated blocks in a given file range
- * Does not check pagecache
- */
-static int quota_reserve_range(struct bch_inode_info *inode,
-                              struct quota_res *res,
-                              u64 start, u64 end)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       u64 sectors = end - start;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter,
-                               BTREE_ID_extents,
-                               POS(inode->v.i_ino, start),
-                               POS(inode->v.i_ino, end - 1),
-                               inode->ei_inum.subvol, 0, k, ({
-                       if (bkey_extent_is_allocation(k.k)) {
-                               u64 s = min(end, k.k->p.offset) -
-                                       max(start, bkey_start_offset(k.k));
-                               BUG_ON(s > sectors);
-                               sectors -= s;
-                       }
-
-                       0;
-               })));
-
-       return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
-}
-
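-/*
- * reflink/dedupe entry point: both offsets must be block aligned; the
- * length is rounded up to the block size for the remap itself and the
- * result clamped back to the caller's len afterwards.
- */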
-loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
-                            struct file *file_dst, loff_t pos_dst,
-                            loff_t len, unsigned remap_flags)
-{
-       struct bch_inode_info *src = file_bch_inode(file_src);
-       struct bch_inode_info *dst = file_bch_inode(file_dst);
-       struct bch_fs *c = src->v.i_sb->s_fs_info;
-       struct quota_res quota_res = { 0 };
-       s64 i_sectors_delta = 0;
-       u64 aligned_len;
-       loff_t ret = 0;
-
-       if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
-               return -EINVAL;
-
-       if ((pos_src & (block_bytes(c) - 1)) ||
-           (pos_dst & (block_bytes(c) - 1)))
-               return -EINVAL;
-
-       if (src == dst &&
-           abs(pos_src - pos_dst) < len)
-               return -EINVAL;
-
-       lock_two_nondirectories(&src->v, &dst->v);
-       bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-
-       inode_dio_wait(&src->v);
-       inode_dio_wait(&dst->v);
-
-       ret = generic_remap_file_range_prep(file_src, pos_src,
-                                           file_dst, pos_dst,
-                                           &len, remap_flags);
-       if (ret < 0 || len == 0)
-               goto err;
-
-       aligned_len = round_up((u64) len, block_bytes(c));
-
-       ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
-                               pos_dst, pos_dst + len - 1);
-       if (ret)
-               goto err;
-
-       ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
-                                 (pos_dst + aligned_len) >> 9);
-       if (ret)
-               goto err;
-
-       if (!(remap_flags & REMAP_FILE_DEDUP))
-               file_update_time(file_dst);
-
-       bch2_mark_pagecache_unallocated(src, pos_src >> 9,
-                                  (pos_src + aligned_len) >> 9);
-
-       /*
-        * XXX: we'd like to be telling bch2_remap_range() if we have
-        * permission to write to the source file, and thus if io path option
-        * changes should be propagated through the copy, but we need mnt_idmap
-        * from the pathwalk, awkward
-        */
-       ret = bch2_remap_range(c,
-                              inode_inum(dst), pos_dst >> 9,
-                              inode_inum(src), pos_src >> 9,
-                              aligned_len >> 9,
-                              pos_dst + len, &i_sectors_delta,
-                              false);
-       if (ret < 0)
-               goto err;
-
-       /*
-        * due to alignment, we might have remapped slightly more than requested
-        */
-       ret = min((u64) ret << 9, (u64) len);
-
-       bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
-
-       spin_lock(&dst->v.i_lock);
-       if (pos_dst + ret > dst->v.i_size)
-               i_size_write(&dst->v, pos_dst + ret);
-       spin_unlock(&dst->v.i_lock);
-
-       if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
-           IS_SYNC(file_inode(file_dst)))
-               ret = bch2_flush_inode(c, dst);
-err:
-       bch2_quota_reservation_put(c, dst, &quota_res);
-       bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-       unlock_two_nondirectories(&src->v, &dst->v);
-
-       return bch2_err_class(ret);
-}
-
-/* fseek: */
-
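-/*
- * SEEK_DATA: find the first data extent at or after offset in the btree,
- * then check the page cache, which may contain dirty data not yet
- * reflected in the btree.
- */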
-static loff_t bch2_seek_data(struct file *file, u64 offset)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       subvol_inum inum = inode_inum(inode);
-       u64 isize, next_data = MAX_LFS_FILESIZE;
-
-       isize = i_size_read(&inode->v);
-       if (offset >= isize)
-               return -ENXIO;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
-                                  POS(inode->v.i_ino, offset >> 9),
-                                  POS(inode->v.i_ino, U64_MAX),
-                                  inum.subvol, 0, k, ({
-                       if (bkey_extent_is_data(k.k)) {
-                               next_data = max(offset, bkey_start_offset(k.k) << 9);
-                               break;
-                       } else if (k.k->p.offset >> 9 > isize)
-                               break;
-                       0;
-               })));
-       if (ret)
-               return ret;
-
-       if (next_data > offset)
-               next_data = bch2_seek_pagecache_data(&inode->v,
-                                       offset, next_data, 0, false);
-
-       if (next_data >= isize)
-               return -ENXIO;
-
-       return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static loff_t bch2_seek_hole(struct file *file, u64 offset)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       subvol_inum inum = inode_inum(inode);
-       u64 isize, next_hole = MAX_LFS_FILESIZE;
-
-       isize = i_size_read(&inode->v);
-       if (offset >= isize)
-               return -ENXIO;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
-                                  POS(inode->v.i_ino, offset >> 9),
-                                  POS(inode->v.i_ino, U64_MAX),
-                                  inum.subvol, BTREE_ITER_slots, k, ({
-                       if (k.k->p.inode != inode->v.i_ino ||
-                           !bkey_extent_is_data(k.k)) {
-                               loff_t start_offset = k.k->p.inode == inode->v.i_ino
-                                       ? max(offset, bkey_start_offset(k.k) << 9)
-                                       : offset;
-                               loff_t end_offset = k.k->p.inode == inode->v.i_ino
-                                       ? MAX_LFS_FILESIZE
-                                       : k.k->p.offset << 9;
-
-                               /*
-                                * Found a hole in the btree; now make sure it's
-                                * a hole in the pagecache. We might have to
-                                * keep searching if this hole is entirely dirty
-                                * in the page cache:
-                                */
-                               bch2_trans_unlock(trans);
-                               loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
-                                                               start_offset, end_offset, 0, false);
-                               if (pagecache_hole < end_offset) {
-                                       next_hole = pagecache_hole;
-                                       break;
-                               }
-                       } else {
-                               offset = max(offset, bkey_start_offset(k.k) << 9);
-                       }
-                       0;
-               })));
-       if (ret)
-               return ret;
-
-       if (next_hole > isize)
-               next_hole = isize;
-
-       return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
-{
-       loff_t ret;
-
-       switch (whence) {
-       case SEEK_SET:
-       case SEEK_CUR:
-       case SEEK_END:
-               ret = generic_file_llseek(file, offset, whence);
-               break;
-       case SEEK_DATA:
-               ret = bch2_seek_data(file, offset);
-               break;
-       case SEEK_HOLE:
-               ret = bch2_seek_hole(file, offset);
-               break;
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       return bch2_err_class(ret);
-}
-
-void bch2_fs_fsio_exit(struct bch_fs *c)
-{
-       bioset_exit(&c->nocow_flush_bioset);
-}
-
-int bch2_fs_fsio_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->nocow_flush_bioset,
-                       1, offsetof(struct nocow_flush, bio), 0))
-               return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
-
-       return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
deleted file mode 100644 (file)
index ca70346..0000000
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_H
-#define _BCACHEFS_FS_IO_H
-
-#ifndef NO_BCACHEFS_FS
-
-#include "buckets.h"
-#include "fs.h"
-#include "io_write_types.h"
-#include "quota.h"
-
-#include <linux/uio.h>
-
-struct folio_vec {
-       struct folio    *fv_folio;
-       size_t          fv_offset;
-       size_t          fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-       struct folio *folio     = page_folio(bv.bv_page);
-       size_t offset           = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
-               bv.bv_offset;
-       size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
-       return (struct folio_vec) {
-               .fv_folio       = folio,
-               .fv_offset      = offset,
-               .fv_len         = len,
-       };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
-                                                   struct bvec_iter iter)
-{
-       return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start)                    \
-       for (iter = (start);                                            \
-            (iter).bi_size &&                                          \
-               ((bvl = bio_iter_iovec_folio((bio), (iter))), 1);       \
-            bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter)                             \
-       __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
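-/*
- * Typical use (sketch; process() stands in for per-folio work):
- *
- *     struct folio_vec fv;
- *     struct bvec_iter iter;
- *
- *     bio_for_each_folio(fv, bio, iter)
- *             process(fv.fv_folio, fv.fv_offset, fv.fv_len);
- */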
-
-struct quota_res {
-       u64                             sectors;
-};
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res)
-{
-       BUG_ON(res->sectors > inode->ei_quota_reserved);
-
-       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-                       -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
-       inode->ei_quota_reserved -= res->sectors;
-       res->sectors = 0;
-}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res)
-{
-       if (res->sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __bch2_quota_reservation_put(c, inode, res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     u64 sectors,
-                                     bool check_enospc)
-{
-       int ret;
-
-       if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
-               return 0;
-
-       mutex_lock(&inode->ei_quota_lock);
-       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
-                             check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
-       if (likely(!ret)) {
-               inode->ei_quota_reserved += sectors;
-               res->sectors += sectors;
-       }
-       mutex_unlock(&inode->ei_quota_lock);
-
-       return ret;
-}
-
-#else
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res) {}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res) {}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     unsigned sectors,
-                                     bool check_enospc)
-{
-       return 0;
-}
-
-#endif
-
-void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
-                          struct quota_res *, s64);
-
-static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                                      struct quota_res *quota_res, s64 sectors)
-{
-       if (sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __bch2_i_sectors_acct(c, inode, quota_res, sectors);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
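-/*
- * current->faults_disabled_mapping uses the pointer's low bit as a flag:
- * set_fdm_dropped_locks() sets it when a page fault had to drop locks, and
- * fdm_dropped_locks() tests it.
- */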
-static inline struct address_space *faults_disabled_mapping(void)
-{
-       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
-       current->faults_disabled_mapping =
-               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
-       return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
-                       struct bch_inode_info *, struct closure *);
-
-int __must_check bch2_write_inode_size(struct bch_fs *,
-                                      struct bch_inode_info *,
-                                      loff_t, unsigned);
-
-int bch2_fsync(struct file *, loff_t, loff_t, int);
-
-int bchfs_truncate(struct mnt_idmap *,
-                 struct bch_inode_info *, struct iattr *);
-long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
-                            loff_t, loff_t, unsigned);
-
-loff_t bch2_llseek(struct file *, loff_t, int);
-
-void bch2_fs_fsio_exit(struct bch_fs *);
-int bch2_fs_fsio_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
deleted file mode 100644 (file)
index 43510da..0000000
+++ /dev/null
@@ -1,440 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "fs.h"
-#include "fs-ioctl.h"
-#include "namei.h"
-#include "quota.h"
-
-#include <linux/compat.h>
-#include <linux/fsnotify.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/writeback.h>
-
-#define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
-#define FSOP_GOING_FLAGS_DEFAULT       0x0     /* going down */
-#define FSOP_GOING_FLAGS_LOGFLUSH      0x1     /* flush log but not data */
-#define FSOP_GOING_FLAGS_NOLOGFLUSH    0x2     /* don't flush log or data */
-
-static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
-                                  struct bch_inode_info *inode,
-                                  struct bch_inode_unpacked *bi,
-                                  void *p)
-{
-       struct bch_inode_info *dir = p;
-
-       return !bch2_reinherit_attrs(bi, &dir->ei_inode);
-}
-
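-/*
- * Look up the named child of @src and reapply src's inheritable attributes
- * to it, transferring project quota first if that option is changing.
- * Returns 1 if attributes were changed, 0 if there was nothing to do.
- */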
-static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
-                                   struct file *file,
-                                   struct bch_inode_info *src,
-                                   const char __user *name)
-{
-       struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
-       struct bch_inode_info *dst;
-       struct inode *vinode = NULL;
-       char *kname = NULL;
-       struct qstr qstr;
-       int ret = 0;
-       subvol_inum inum;
-
-       kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
-       if (!kname)
-               return -ENOMEM;
-
-       ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
-       if (unlikely(ret < 0))
-               goto err1;
-
-       qstr.len        = ret;
-       qstr.name       = kname;
-
-       ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
-       if (ret)
-               goto err1;
-
-       vinode = bch2_vfs_inode_get(c, inum);
-       ret = PTR_ERR_OR_ZERO(vinode);
-       if (ret)
-               goto err1;
-
-       dst = to_bch_ei(vinode);
-
-       ret = mnt_want_write_file(file);
-       if (ret)
-               goto err2;
-
-       bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
-
-       if (inode_attr_changing(src, dst, Inode_opt_project)) {
-               ret = bch2_fs_quota_transfer(c, dst,
-                                            src->ei_qid,
-                                            1 << QTYP_PRJ,
-                                            KEY_TYPE_QUOTA_PREALLOC);
-               if (ret)
-                       goto err3;
-       }
-
-       ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
-err3:
-       bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
-
-       /* return true if we did work */
-       if (ret >= 0)
-               ret = !ret;
-
-       mnt_drop_write_file(file);
-err2:
-       iput(vinode);
-err1:
-       kfree(kname);
-
-       return ret;
-}
-
-static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
-{
-       return put_user(inode->v.i_generation, arg);
-}
-
-static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
-{
-       int ret;
-       size_t len;
-       char label[BCH_SB_LABEL_SIZE];
-
-       BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
-
-       mutex_lock(&c->sb_lock);
-       memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
-       mutex_unlock(&c->sb_lock);
-
-       len = strnlen(label, BCH_SB_LABEL_SIZE);
-       if (len == BCH_SB_LABEL_SIZE) {
-               bch_warn(c,
-                       "label is too long, return the first %zu bytes",
-                       --len);
-       }
-
-       ret = copy_to_user(user_label, label, len);
-
-       return ret ? -EFAULT : 0;
-}
-
-static int bch2_ioc_setlabel(struct bch_fs *c,
-                            struct file *file,
-                            struct bch_inode_info *inode,
-                            const char __user *user_label)
-{
-       int ret;
-       char label[BCH_SB_LABEL_SIZE];
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (copy_from_user(label, user_label, sizeof(label)))
-               return -EFAULT;
-
-       if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
-               bch_err(c,
-                       "unable to set label with more than %d bytes",
-                       BCH_SB_LABEL_SIZE - 1);
-               return -EINVAL;
-       }
-
-       ret = mnt_want_write_file(file);
-       if (ret)
-               return ret;
-
-       mutex_lock(&c->sb_lock);
-       strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
-       ret = bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       mnt_drop_write_file(file);
-       return ret;
-}
-
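-/*
- * XFS-style shutdown ioctl: depending on @arg, freeze the block device
- * and/or flush the journal, then force the filesystem into emergency
- * read-only mode.
- */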
-static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
-{
-       u32 flags;
-       int ret = 0;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (get_user(flags, arg))
-               return -EFAULT;
-
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       prt_printf(&buf, "shutdown by ioctl type %u", flags);
-
-       switch (flags) {
-       case FSOP_GOING_FLAGS_DEFAULT:
-               ret = bdev_freeze(c->vfs_sb->s_bdev);
-               if (ret)
-                       break;
-               bch2_journal_flush(&c->journal);
-               bch2_fs_emergency_read_only2(c, &buf);
-               bdev_thaw(c->vfs_sb->s_bdev);
-               break;
-       case FSOP_GOING_FLAGS_LOGFLUSH:
-               bch2_journal_flush(&c->journal);
-               fallthrough;
-       case FSOP_GOING_FLAGS_NOLOGFLUSH:
-               bch2_fs_emergency_read_only2(c, &buf);
-               break;
-       default:
-               ret = -EINVAL;
-               goto noprint;
-       }
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-noprint:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
-                                       struct bch_ioctl_subvolume arg)
-{
-       struct inode *dir;
-       struct bch_inode_info *inode;
-       struct user_namespace *s_user_ns;
-       struct dentry *dst_dentry;
-       struct path src_path, dst_path;
-       int how = LOOKUP_FOLLOW;
-       int error;
-       subvol_inum snapshot_src = { 0 };
-       unsigned lookup_flags = 0;
-       unsigned create_flags = BCH_CREATE_SUBVOL;
-
-       if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
-                         BCH_SUBVOL_SNAPSHOT_RO))
-               return -EINVAL;
-
-       if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
-           (arg.src_ptr ||
-            (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
-               return -EINVAL;
-
-       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
-               create_flags |= BCH_CREATE_SNAPSHOT;
-
-       if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
-               create_flags |= BCH_CREATE_SNAPSHOT_RO;
-
-       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
-               /* sync_inodes_sb() requires that s_umount be held */
-               down_read(&c->vfs_sb->s_umount);
-               sync_inodes_sb(c->vfs_sb);
-               up_read(&c->vfs_sb->s_umount);
-       }
-
-       if (arg.src_ptr) {
-               error = user_path_at(arg.dirfd,
-                               (const char __user *)(unsigned long)arg.src_ptr,
-                               how, &src_path);
-               if (error)
-                       goto err1;
-
-               if (src_path.dentry->d_sb->s_fs_info != c) {
-                       path_put(&src_path);
-                       error = -EXDEV;
-                       goto err1;
-               }
-
-               snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
-       }
-
-       dst_dentry = start_creating_user_path(arg.dirfd,
-                       (const char __user *)(unsigned long)arg.dst_ptr,
-                       &dst_path, lookup_flags);
-       error = PTR_ERR_OR_ZERO(dst_dentry);
-       if (error)
-               goto err2;
-
-       if (dst_dentry->d_sb->s_fs_info != c) {
-               error = -EXDEV;
-               goto err3;
-       }
-
-       if (dst_dentry->d_inode) {
-               error = bch_err_throw(c, EEXIST_subvolume_create);
-               goto err3;
-       }
-
-       dir = dst_path.dentry->d_inode;
-       if (IS_DEADDIR(dir)) {
-               error = bch_err_throw(c, ENOENT_directory_dead);
-               goto err3;
-       }
-
-       s_user_ns = dir->i_sb->s_user_ns;
-       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-           !kgid_has_mapping(s_user_ns, current_fsgid())) {
-               error = -EOVERFLOW;
-               goto err3;
-       }
-
-       error = inode_permission(file_mnt_idmap(filp),
-                                dir, MAY_WRITE | MAY_EXEC);
-       if (error)
-               goto err3;
-
-       if (!IS_POSIXACL(dir))
-               arg.mode &= ~current_umask();
-
-       error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
-       if (error)
-               goto err3;
-
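-       /*
-        * Snapshot create with no explicit source: snapshot the subvolume
-        * containing the new dentry's parent directory:
-        */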
-       if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
-           !arg.src_ptr)
-               snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
-
-       down_write(&c->snapshot_create_lock);
-       inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
-                             dst_dentry, arg.mode|S_IFDIR,
-                             0, snapshot_src, create_flags);
-       up_write(&c->snapshot_create_lock);
-
-       error = PTR_ERR_OR_ZERO(inode);
-       if (error)
-               goto err3;
-
-       d_instantiate(dst_dentry, &inode->v);
-       fsnotify_mkdir(dir, dst_dentry);
-err3:
-       end_creating_path(&dst_path, dst_dentry);
-err2:
-       if (arg.src_ptr)
-               path_put(&src_path);
-err1:
-       return error;
-}
-
-static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
-                               struct bch_ioctl_subvolume arg)
-{
-       const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
-       struct path path;
-       struct inode *dir;
-       struct dentry *victim;
-       int ret = 0;
-
-       if (arg.flags)
-               return -EINVAL;
-
-       victim = start_removing_user_path_at(arg.dirfd, name, &path);
-       if (IS_ERR(victim))
-               return PTR_ERR(victim);
-
-       dir = d_inode(path.dentry);
-       if (victim->d_sb->s_fs_info != c) {
-               ret = -EXDEV;
-               goto err;
-       }
-
-       ret =   inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?:
-               __bch2_unlink(dir, victim, true);
-       if (!ret) {
-               fsnotify_rmdir(dir, victim);
-               d_invalidate(victim);
-       }
-err:
-       end_removing_path(&path, victim);
-       return ret;
-}
-
-long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       long ret;
-
-       switch (cmd) {
-       case BCHFS_IOC_REINHERIT_ATTRS:
-               ret = bch2_ioc_reinherit_attrs(c, file, inode,
-                                              (void __user *) arg);
-               break;
-
-       case FS_IOC_GETVERSION:
-               ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
-               break;
-
-       case FS_IOC_SETVERSION:
-               ret = -ENOTTY;
-               break;
-
-       case FS_IOC_GETFSLABEL:
-               ret = bch2_ioc_getlabel(c, (void __user *) arg);
-               break;
-
-       case FS_IOC_SETFSLABEL:
-               ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
-               break;
-
-       case FS_IOC_GOINGDOWN:
-               ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
-               break;
-
-       case BCH_IOCTL_SUBVOLUME_CREATE: {
-               struct bch_ioctl_subvolume i;
-
-               ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
-                       ? -EFAULT
-                       : bch2_ioctl_subvolume_create(c, file, i);
-               break;
-       }
-
-       case BCH_IOCTL_SUBVOLUME_DESTROY: {
-               struct bch_ioctl_subvolume i;
-
-               ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
-                       ? -EFAULT
-                       : bch2_ioctl_subvolume_destroy(c, file, i);
-               break;
-       }
-
-       default:
-               ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
-               break;
-       }
-
-       return bch2_err_class(ret);
-}
-
-#ifdef CONFIG_COMPAT
-long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-       /* These are just misnamed; they actually get/put an int from/to userspace */
-       switch (cmd) {
-       case FS_IOC32_GETFLAGS:
-               cmd = FS_IOC_GETFLAGS;
-               break;
-       case FS_IOC32_SETFLAGS:
-               cmd = FS_IOC_SETFLAGS;
-               break;
-       case FS_IOC32_GETVERSION:
-               cmd = FS_IOC_GETVERSION;
-               break;
-       case FS_IOC_GETFSLABEL:
-       case FS_IOC_SETFSLABEL:
-               break;
-       default:
-               return -ENOIOCTLCMD;
-       }
-       return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
deleted file mode 100644 (file)
index a657e49..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IOCTL_H
-#define _BCACHEFS_FS_IOCTL_H
-
-long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
-long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
-
-#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
deleted file mode 100644 (file)
index 687af0e..0000000
+++ /dev/null
@@ -1,2768 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "errcode.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-ioctl.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "journal.h"
-#include "keylist.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/exportfs.h>
-#include <linux/fiemap.h>
-#include <linux/fileattr.h>
-#include <linux/fs_context.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/posix_acl.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/siphash.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch2_inode_cache;
-
-static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
-                               struct bch_inode_info *,
-                               struct bch_inode_unpacked *,
-                               struct bch_subvolume *);
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       static const __maybe_unused unsigned bch_flags_to_vfs[] = {
-               [__BCH_INODE_sync]              = S_SYNC,
-               [__BCH_INODE_immutable]         = S_IMMUTABLE,
-               [__BCH_INODE_append]            = S_APPEND,
-               [__BCH_INODE_noatime]           = S_NOATIME,
-       };
-
-       set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-
-       if (bch2_inode_casefold(c, &inode->ei_inode))
-               inode->v.i_flags |= S_CASEFOLD;
-       else
-               inode->v.i_flags &= ~S_CASEFOLD;
-}
-
-void bch2_inode_update_after_write(struct btree_trans *trans,
-                                  struct bch_inode_info *inode,
-                                  struct bch_inode_unpacked *bi,
-                                  unsigned fields)
-{
-       struct bch_fs *c = trans->c;
-
-       BUG_ON(bi->bi_inum != inode->v.i_ino);
-
-       bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
-
-       set_nlink(&inode->v, bch2_inode_nlink_get(bi));
-       i_uid_write(&inode->v, bi->bi_uid);
-       i_gid_write(&inode->v, bi->bi_gid);
-       inode->v.i_mode = bi->bi_mode;
-
-       if (fields & ATTR_SIZE)
-               i_size_write(&inode->v, bi->bi_size);
-
-       if (fields & ATTR_ATIME)
-               inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
-       if (fields & ATTR_MTIME)
-               inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
-       if (fields & ATTR_CTIME)
-               inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
-
-       inode->ei_inode         = *bi;
-
-       bch2_inode_flags_to_vfs(c, inode);
-}
-
-int __must_check bch2_write_inode(struct bch_fs *c,
-                                 struct bch_inode_info *inode,
-                                 inode_set_fn set,
-                                 void *p, unsigned fields)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter = {};
-       struct bch_inode_unpacked inode_u;
-       int ret;
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
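-       /*
-        * If this update changes the inode's rebalance options, mark the
-        * inode as needing a rebalance scan (and wake the rebalance thread
-        * once we're done):
-        */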
-       struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-
-       ret = (set ? set(trans, inode, &inode_u, p) : 0);
-       if (ret)
-               goto err;
-
-       struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-       bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
-
-       if (rebalance_changed) {
-               ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
-               if (ret)
-                       goto err;
-       }
-
-       ret   = bch2_inode_write(trans, &iter, &inode_u) ?:
-               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
-       /*
-        * the btree node lock protects inode->ei_inode, not ei_update_lock;
-        * this is important for inode updates via bchfs_write_index_update
-        */
-       if (!ret)
-               bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (rebalance_changed)
-               bch2_rebalance_wakeup(c);
-
-       bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-                            "%s: inode %llu:%llu not found when updating",
-                            bch2_err_str(ret),
-                            inode_inum(inode).subvol,
-                            inode_inum(inode).inum);
-
-       bch2_trans_put(trans);
-       return ret < 0 ? ret : 0;
-}
-
-int bch2_fs_quota_transfer(struct bch_fs *c,
-                          struct bch_inode_info *inode,
-                          struct bch_qid new_qid,
-                          unsigned qtypes,
-                          enum quota_acct_mode mode)
-{
-       unsigned i;
-       int ret;
-
-       qtypes &= enabled_qtypes(c);
-
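-       /* Skip quota types whose qid isn't changing: */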
-       for (i = 0; i < QTYP_NR; i++)
-               if (new_qid.q[i] == inode->ei_qid.q[i])
-                       qtypes &= ~(1U << i);
-
-       if (!qtypes)
-               return 0;
-
-       mutex_lock(&inode->ei_quota_lock);
-
-       ret = bch2_quota_transfer(c, qtypes, new_qid,
-                                 inode->ei_qid,
-                                 inode->v.i_blocks +
-                                 inode->ei_quota_reserved,
-                                 mode);
-       if (!ret)
-               for (i = 0; i < QTYP_NR; i++)
-                       if (qtypes & (1 << i))
-                               inode->ei_qid.q[i] = new_qid.q[i];
-
-       mutex_unlock(&inode->ei_quota_lock);
-
-       return ret;
-}
-
-static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
-{
-       const subvol_inum *inum = data;
-       siphash_key_t k = { .key[0] = seed };
-
-       return siphash_2u64(inum->subvol, inum->inum, &k);
-}
-
-static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
-{
-       const struct bch_inode_info *inode = data;
-
-       return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
-}
-
-static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
-                                const void *obj)
-{
-       const struct bch_inode_info *inode = obj;
-       const subvol_inum *v = arg->key;
-
-       return !subvol_inum_eq(inode->ei_inum, *v);
-}
-
-static const struct rhashtable_params bch2_vfs_inodes_params = {
-       .head_offset            = offsetof(struct bch_inode_info, hash),
-       .key_offset             = offsetof(struct bch_inode_info, ei_inum),
-       .key_len                = sizeof(subvol_inum),
-       .hashfn                 = bch2_vfs_inode_hash_fn,
-       .obj_hashfn             = bch2_vfs_inode_obj_hash_fn,
-       .obj_cmpfn              = bch2_vfs_inode_cmp_fn,
-       .automatic_shrinking    = true,
-};
-
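-/*
- * Second index over the same inodes, keyed by inode number alone: an
- * rhltable, since the same inum may be open in multiple subvolumes; used
- * by bch2_inode_or_descendents_is_open() below:
- */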
-static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
-       .head_offset            = offsetof(struct bch_inode_info, by_inum_hash),
-       .key_offset             = offsetof(struct bch_inode_info, ei_inum.inum),
-       .key_len                = sizeof(u64),
-       .automatic_shrinking    = true,
-};
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
-{
-       struct bch_fs *c = trans->c;
-       struct rhltable *ht = &c->vfs_inodes_by_inum_table;
-       u64 inum = p.offset;
-       DARRAY(u32) subvols;
-       int ret = 0;
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return false;
-
-       darray_init(&subvols);
-restart_from_top:
-
-       /*
-        * Tweaked version of __rhashtable_lookup(); we need to get a list of
-        * subvolumes in which the given inode number is open.
-        *
-        * For this to work, we don't include the subvolume ID in the key that
-        * we hash - all inodes with the same inode number regardless of
-        * subvolume will hash to the same slot.
-        *
-        * This will be less than ideal if the same file is ever open
-        * simultaneously in many different snapshots:
-        */
-       rcu_read_lock();
-       struct rhash_lock_head __rcu *const *bkt;
-       struct rhash_head *he;
-       unsigned int hash;
-       struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
-restart:
-       hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
-       bkt = rht_bucket(tbl, hash);
-       do {
-               struct bch_inode_info *inode;
-
-               rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
-                       if (inode->ei_inum.inum == inum) {
-                               ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
-                                                     GFP_NOWAIT|__GFP_NOWARN);
-                               if (ret) {
-                                       rcu_read_unlock();
-                                       ret = darray_make_room(&subvols, 1);
-                                       if (ret)
-                                               goto err;
-                                       subvols.nr = 0;
-                                       goto restart_from_top;
-                               }
-                       }
-               }
-               /*
-                * An object might have been moved to a different hash chain
-                * while we walk along it - better check and retry.
-                */
-       } while (he != RHT_NULLS_MARKER(bkt));
-
-       /* Ensure we see any new tables. */
-       smp_rmb();
-
-       tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
-       if (unlikely(tbl))
-               goto restart;
-       rcu_read_unlock();
-
-       darray_for_each(subvols, i) {
-               u32 snap;
-               ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
-               if (ret)
-                       goto err;
-
-               ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
-               if (ret)
-                       break;
-       }
-err:
-       darray_exit(&subvols);
-       return ret;
-}
-
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
-       return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
-
-static void __wait_on_freeing_inode(struct bch_fs *c,
-                                   struct bch_inode_info *inode,
-                                   subvol_inum inum)
-{
-       wait_queue_head_t *wq;
-       struct wait_bit_queue_entry wait;
-
-       wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
-       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-       spin_unlock(&inode->v.i_lock);
-
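-       /*
-        * If the inode is still hashed, it's still being freed: sleep until
-        * bch2_inode_hash_remove() wakes __I_NEW, with the timeout as a
-        * safety net:
-        */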
-       if (__bch2_inode_hash_find(c, inum) == inode)
-               schedule_timeout(HZ * 10);
-       finish_wait(wq, &wait.wq_entry);
-}
-
-static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
-                                                  subvol_inum inum)
-{
-       struct bch_inode_info *inode;
-repeat:
-       inode = __bch2_inode_hash_find(c, inum);
-       if (inode) {
-               spin_lock(&inode->v.i_lock);
-               if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
-                       spin_unlock(&inode->v.i_lock);
-                       return NULL;
-               }
-               if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
-                       if (!trans) {
-                               __wait_on_freeing_inode(c, inode, inum);
-                       } else {
-                               int ret = drop_locks_do(trans,
-                                               (__wait_on_freeing_inode(c, inode, inum), 0));
-                               if (ret)
-                                       return ERR_PTR(ret);
-                       }
-                       goto repeat;
-               }
-               __iget(&inode->v);
-               spin_unlock(&inode->v.i_lock);
-       }
-
-       return inode;
-}
-
-static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       spin_lock(&inode->v.i_lock);
-       bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-       spin_unlock(&inode->v.i_lock);
-
-       if (remove) {
-               int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
-                                       &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
-               BUG_ON(ret);
-
-               ret = rhashtable_remove_fast(&c->vfs_inodes_table,
-                                       &inode->hash, bch2_vfs_inodes_params);
-               BUG_ON(ret);
-               inode->v.i_hash.pprev = NULL;
-               /*
-                * This pairs with the bch2_inode_hash_find() ->
-                * __wait_on_freeing_inode() path
-                */
-               inode_wake_up_bit(&inode->v, __I_NEW);
-       }
-}
-
-static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
-                                                    struct btree_trans *trans,
-                                                    struct bch_inode_info *inode)
-{
-       struct bch_inode_info *old = inode;
-
-       set_bit(EI_INODE_HASHED, &inode->ei_flags);
-retry:
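-       /*
-        * Insert can only fail if an inode with this subvol:inum is already
-        * hashed; look it up and take a ref. If it was removed in the
-        * meantime, retry the insert:
-        */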
-       if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
-                                       &inode->ei_inum,
-                                       &inode->hash,
-                                       bch2_vfs_inodes_params))) {
-               old = bch2_inode_hash_find(c, trans, inode->ei_inum);
-               if (!old)
-                       goto retry;
-
-               clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-
-               /*
-                * bcachefs doesn't use I_NEW; we have no use for it since we
-                * only insert fully created inodes in the inode hash table. But
-                * discard_new_inode() expects it to be set...
-                */
-               inode->v.i_state |= I_NEW;
-               /*
-                * We don't want bch2_evict_inode() to delete the inode on disk,
-                * we just raced and had another inode in cache. Normally new
-                * inodes don't have nlink == 0 - except tmpfiles do...
-                */
-               set_nlink(&inode->v, 1);
-               discard_new_inode(&inode->v);
-               return old;
-       } else {
-               int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
-                                         &inode->by_inum_hash,
-                                         bch2_vfs_inodes_by_inum_params);
-               BUG_ON(ret);
-
-               inode_fake_hash(&inode->v);
-
-               inode_sb_list_add(&inode->v);
-
-               mutex_lock(&c->vfs_inodes_lock);
-               list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-               mutex_unlock(&c->vfs_inodes_lock);
-               return inode;
-       }
-}
-
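-/*
- * Evaluate an expression with the given allocation flags (PF_*) set on the
- * current task, restoring the task's flags afterwards:
- */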
-#define memalloc_flags_do(_flags, _do)                                         \
-({                                                                             \
-       unsigned _saved_flags = memalloc_flags_save(_flags);                    \
-       typeof(_do) _ret = _do;                                                 \
-       memalloc_noreclaim_restore(_saved_flags);                               \
-       _ret;                                                                   \
-})
-
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
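-       /*
-        * bcachefs never uses the VFS inode allocation path - inodes are
-        * allocated and initialized in __bch2_new_inode():
-        */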
-       BUG();
-}
-
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
-{
-       struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
-                                               bch2_inode_cache, gfp);
-       if (!inode)
-               return NULL;
-
-       inode_init_once(&inode->v);
-       mutex_init(&inode->ei_update_lock);
-       two_state_lock_init(&inode->ei_pagecache_lock);
-       INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
-       inode->ei_flags = 0;
-       mutex_init(&inode->ei_quota_lock);
-       memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
-       if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
-               kmem_cache_free(bch2_inode_cache, inode);
-               return NULL;
-       }
-
-       return inode;
-}
-
-/*
- * Allocate a new inode, dropping/retaking btree locks if necessary:
- */
-static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
-{
-       struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
-
-       if (unlikely(!inode)) {
-               int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
-               if (ret && inode) {
-                       __destroy_inode(&inode->v);
-                       kmem_cache_free(bch2_inode_cache, inode);
-               }
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-
-       return inode;
-}
-
-static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
-                                                         subvol_inum inum,
-                                                         struct bch_inode_unpacked *bi,
-                                                         struct bch_subvolume *subvol)
-{
-       struct bch_inode_info *inode = bch2_new_inode(trans);
-       if (IS_ERR(inode))
-               return inode;
-
-       bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
-
-       return bch2_inode_hash_insert(trans->c, trans, inode);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
-{
-       struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
-       if (inode)
-               return &inode->v;
-
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       struct bch_inode_unpacked inode_u;
-       struct bch_subvolume subvol;
-       int ret = lockrestart_do(trans,
-               bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
-               bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
-               PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-       bch2_trans_put(trans);
-
-       return ret ? ERR_PTR(ret) : &inode->v;
-}
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *idmap,
-             struct bch_inode_info *dir, struct dentry *dentry,
-             umode_t mode, dev_t rdev, subvol_inum snapshot_src,
-             unsigned flags)
-{
-       struct bch_fs *c = dir->v.i_sb->s_fs_info;
-       struct btree_trans *trans;
-       struct bch_inode_unpacked dir_u;
-       struct bch_inode_info *inode;
-       struct bch_inode_unpacked inode_u;
-       struct posix_acl *default_acl = NULL, *acl = NULL;
-       subvol_inum inum;
-       struct bch_subvolume subvol;
-       u64 journal_seq = 0;
-       kuid_t kuid;
-       kgid_t kgid;
-       int ret;
-
-       /*
-        * preallocate acls + vfs inode before btree transaction, so that
-        * nothing can fail after the transaction succeeds:
-        */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
-       if (ret)
-               return ERR_PTR(ret);
-#endif
-       inode = __bch2_new_inode(c, GFP_NOFS);
-       if (unlikely(!inode)) {
-               inode = ERR_PTR(-ENOMEM);
-               goto err;
-       }
-
-       bch2_inode_init_early(c, &inode_u);
-
-       if (!(flags & BCH_CREATE_TMPFILE))
-               mutex_lock(&dir->ei_update_lock);
-
-       trans = bch2_trans_get(c);
-retry:
-       bch2_trans_begin(trans);
-
-       kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
-       kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
-       ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
-               bch2_create_trans(trans,
-                                 inode_inum(dir), &dir_u, &inode_u,
-                                 !(flags & BCH_CREATE_TMPFILE)
-                                 ? &dentry->d_name : NULL,
-                                 from_kuid(i_user_ns(&dir->v), kuid),
-                                 from_kgid(i_user_ns(&dir->v), kgid),
-                                 mode, rdev,
-                                 default_acl, acl, snapshot_src, flags) ?:
-               bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
-                               KEY_TYPE_QUOTA_PREALLOC);
-       if (unlikely(ret))
-               goto err_before_quota;
-
-       inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
-       inum.inum = inode_u.bi_inum;
-
-       ret   = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
-               bch2_trans_commit(trans, NULL, &journal_seq, 0);
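-       /*
-        * If quota accounting succeeded but the commit failed, the +1 above
-        * must be undone; err_before_quota skips that unwind for errors from
-        * the create transaction itself:
-        */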
-       if (unlikely(ret)) {
-               bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
-                               KEY_TYPE_QUOTA_WARN);
-err_before_quota:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto retry;
-               goto err_trans;
-       }
-
-       if (!(flags & BCH_CREATE_TMPFILE)) {
-               bch2_inode_update_after_write(trans, dir, &dir_u,
-                                             ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-               mutex_unlock(&dir->ei_update_lock);
-       }
-
-       bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-
-       set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-       set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
-
-       /*
-        * we must insert the new inode into the inode cache before calling
-        * bch2_trans_exit() and dropping locks, else we could race with another
-        * thread pulling the inode in and modifying it:
-        *
-        * also, calling bch2_inode_hash_insert() without passing in the
-        * transaction object is sketchy - if we could ever end up in
-        * __wait_on_freeing_inode(), we'd risk deadlock.
-        *
-        * But that shouldn't be possible, since we still have the inode locked
-        * that we just created, and we _really_ can't take a transaction
-        * restart here.
-        */
-       inode = bch2_inode_hash_insert(c, NULL, inode);
-       bch2_trans_put(trans);
-err:
-       posix_acl_release(default_acl);
-       posix_acl_release(acl);
-       return inode;
-err_trans:
-       if (!(flags & BCH_CREATE_TMPFILE))
-               mutex_unlock(&dir->ei_update_lock);
-
-       bch2_trans_put(trans);
-       make_bad_inode(&inode->v);
-       iput(&inode->v);
-       inode = ERR_PTR(ret);
-       goto err;
-}
-
-/* methods */
-
-static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
-                       subvol_inum dir, struct bch_hash_info *dir_hash_info,
-                       const struct qstr *name)
-{
-       struct bch_fs *c = trans->c;
-       subvol_inum inum = {};
-       struct printbuf buf = PRINTBUF;
-
-       struct qstr lookup_name;
-       int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
-       if (ret)
-               return ERR_PTR(ret);
-
-       struct btree_iter dirent_iter = {};
-       struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
-                                            dir_hash_info, dir, &lookup_name, 0);
-       ret = bkey_err(k);
-       if (ret)
-               return ERR_PTR(ret);
-
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-       ret = bch2_dirent_read_target(trans, dir, d, &inum);
-       if (ret > 0)
-               ret = -ENOENT;
-       if (ret)
-               goto err;
-
-       struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
-       if (inode)
-               goto out;
-
-       /*
-        * Note: if check/repair needs it, we commit before
-        * bch2_inode_hash_init_insert(), as after that point we can't take a
-        * restart - not in the top level loop with a commit_do(), like we
-        * usually do:
-        */
-
-       struct bch_subvolume subvol;
-       struct bch_inode_unpacked inode_u;
-       ret =   bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
-               bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
-               bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
-               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-               PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-
-       /*
-        * don't remove it: check_inodes might find another inode that points
-        * back to this dirent
-        */
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
-                               c, "dirent to missing inode:\n%s",
-                               (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
-       if (ret)
-               goto err;
-out:
-       bch2_trans_iter_exit(trans, &dirent_iter);
-       printbuf_exit(&buf);
-       return inode;
-err:
-       inode = ERR_PTR(ret);
-       goto out;
-}
-
-static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
-                                 unsigned int flags)
-{
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-
-       struct bch_inode_info *inode;
-       bch2_trans_do(c,
-               PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
-                                                         &hash, &dentry->d_name)));
-       if (IS_ERR(inode))
-               inode = NULL;
-
-       if (!inode && IS_CASEFOLDED(vdir)) {
-               /*
-                * Do not cache a negative dentry in casefolded directories
-                * as it would need to be invalidated in the following situation:
-                * - Lookup file "blAH" in a casefolded directory
-                * - Creation of file "BLAH" in a casefolded directory
-                * - Lookup file "blAH" in a casefolded directory
-                * which would fail if we had a negative dentry.
-                *
-                * We should come back to this when VFS has a method to handle
-                * this edge case.
-                */
-               return NULL;
-       }
-
-       return d_splice_alias(&inode->v, dentry);
-}
-
-static int bch2_mknod(struct mnt_idmap *idmap,
-                     struct inode *vdir, struct dentry *dentry,
-                     umode_t mode, dev_t rdev)
-{
-       struct bch_inode_info *inode =
-               __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
-                             (subvol_inum) { 0 }, 0);
-
-       if (IS_ERR(inode))
-               return bch2_err_class(PTR_ERR(inode));
-
-       d_instantiate(dentry, &inode->v);
-       return 0;
-}
-
-static int bch2_create(struct mnt_idmap *idmap,
-                      struct inode *vdir, struct dentry *dentry,
-                      umode_t mode, bool excl)
-{
-       return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
-}
-
-static int __bch2_link(struct bch_fs *c,
-                      struct bch_inode_info *inode,
-                      struct bch_inode_info *dir,
-                      struct dentry *dentry)
-{
-       struct bch_inode_unpacked dir_u, inode_u;
-       int ret;
-
-       mutex_lock(&inode->ei_update_lock);
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_link_trans(trans,
-                                       inode_inum(dir),   &dir_u,
-                                       inode_inum(inode), &inode_u,
-                                       &dentry->d_name));
-
-       if (likely(!ret)) {
-               bch2_inode_update_after_write(trans, dir, &dir_u,
-                                             ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-               bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
-       }
-
-       bch2_trans_put(trans);
-       mutex_unlock(&inode->ei_update_lock);
-       return ret;
-}
-
-static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
-                    struct dentry *dentry)
-{
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
-       int ret;
-
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
-               bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
-               __bch2_link(c, inode, dir, dentry);
-       if (unlikely(ret))
-               return bch2_err_class(ret);
-
-       ihold(&inode->v);
-       d_instantiate(dentry, &inode->v);
-       return 0;
-}
-
-int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
-                 bool deleting_snapshot)
-{
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct bch_inode_unpacked dir_u, inode_u;
-       int ret;
-
-       bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       ret = commit_do(trans, NULL, NULL,
-                       BCH_TRANS_COMMIT_no_enospc,
-               bch2_unlink_trans(trans,
-                                 inode_inum(dir), &dir_u,
-                                 &inode_u, &dentry->d_name,
-                                 deleting_snapshot));
-       if (unlikely(ret))
-               goto err;
-
-       bch2_inode_update_after_write(trans, dir, &dir_u,
-                                     ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-       bch2_inode_update_after_write(trans, inode, &inode_u,
-                                     ATTR_MTIME);
-
-       if (inode_u.bi_subvol) {
-               /*
-                * Subvolume deletion is asynchronous, but we still want to tell
-                * the VFS that it's been deleted here:
-                */
-               set_nlink(&inode->v, 0);
-       }
-
-       if (IS_CASEFOLDED(vdir))
-               d_invalidate(dentry);
-err:
-       bch2_trans_put(trans);
-       bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
-       return ret;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
-{
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_fs *c = dir->v.i_sb->s_fs_info;
-
-       int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
-               __bch2_unlink(vdir, dentry, false);
-       return bch2_err_class(ret);
-}
-
-static int bch2_symlink(struct mnt_idmap *idmap,
-                       struct inode *vdir, struct dentry *dentry,
-                       const char *symname)
-{
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
-       int ret;
-
-       inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-       if (IS_ERR(inode))
-               return bch2_err_class(PTR_ERR(inode));
-
-       inode_lock(&inode->v);
-       ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
-       inode_unlock(&inode->v);
-
-       if (unlikely(ret))
-               goto err;
-
-       ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
-       if (unlikely(ret))
-               goto err;
-
-       ret = __bch2_link(c, inode, dir, dentry);
-       if (unlikely(ret))
-               goto err;
-
-       d_instantiate(dentry, &inode->v);
-       return 0;
-err:
-       iput(&inode->v);
-       return bch2_err_class(ret);
-}
-
-static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
-                                struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
-       return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
-}
-
-static int bch2_rename2(struct mnt_idmap *idmap,
-                       struct inode *src_vdir, struct dentry *src_dentry,
-                       struct inode *dst_vdir, struct dentry *dst_dentry,
-                       unsigned flags)
-{
-       struct bch_fs *c = src_vdir->i_sb->s_fs_info;
-       struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
-       struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
-       struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
-       struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
-       struct bch_inode_unpacked dst_dir_u, src_dir_u;
-       struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
-       struct btree_trans *trans;
-       enum bch_rename_mode mode = flags & RENAME_EXCHANGE
-               ? BCH_RENAME_EXCHANGE
-               : dst_dentry->d_inode
-               ? BCH_RENAME_OVERWRITE : BCH_RENAME;
-       bool whiteout = !!(flags & RENAME_WHITEOUT);
-       int ret;
-
-       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
-               return -EINVAL;
-
-       if (mode == BCH_RENAME_OVERWRITE) {
-               ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
-                                                  0, LLONG_MAX);
-               if (ret)
-                       return ret;
-       }
-
-       bch2_lock_inodes(INODE_UPDATE_LOCK,
-                        src_dir,
-                        dst_dir,
-                        src_inode,
-                        dst_inode);
-
-       trans = bch2_trans_get(c);
-
-       ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
-               bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
-       if (ret)
-               goto err_tx_restart;
-
-       if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
-               ret = bch2_fs_quota_transfer(c, src_inode,
-                                            dst_dir->ei_qid,
-                                            1 << QTYP_PRJ,
-                                            KEY_TYPE_QUOTA_PREALLOC);
-               if (ret)
-                       goto err;
-       }
-
-       if (mode == BCH_RENAME_EXCHANGE &&
-           inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
-               ret = bch2_fs_quota_transfer(c, dst_inode,
-                                            src_dir->ei_qid,
-                                            1 << QTYP_PRJ,
-                                            KEY_TYPE_QUOTA_PREALLOC);
-               if (ret)
-                       goto err;
-       }
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_rename_trans(trans,
-                               inode_inum(src_dir), &src_dir_u,
-                               inode_inum(dst_dir), &dst_dir_u,
-                               &src_inode_u,
-                               &dst_inode_u,
-                               &src_dentry->d_name,
-                               &dst_dentry->d_name,
-                               mode);
-       if (unlikely(ret))
-               goto err_tx_restart;
-
-       if (whiteout) {
-               whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
-               ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
-               if (unlikely(ret))
-                       goto err_tx_restart;
-               bch2_inode_init_early(c, whiteout_inode_u);
-
-               ret = bch2_create_trans(trans,
-                                       inode_inum(src_dir), &src_dir_u,
-                                       whiteout_inode_u,
-                                       &src_dentry->d_name,
-                                       from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
-                                       from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
-                                       S_IFCHR|WHITEOUT_MODE, 0,
-                                       NULL, NULL, (subvol_inum) { 0 }, 0) ?:
-                     bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
-                                     KEY_TYPE_QUOTA_PREALLOC);
-               if (unlikely(ret))
-                       goto err_tx_restart;
-       }
-
-       ret = bch2_trans_commit(trans, NULL, NULL, 0);
-       if (unlikely(ret)) {
-err_tx_restart:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto retry;
-               goto err;
-       }
-
-       BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
-       BUG_ON(dst_inode &&
-              dst_inode->v.i_ino != dst_inode_u.bi_inum);
-
-       bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
-                                     ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
-       if (src_dir != dst_dir)
-               bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
-                                             ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
-       bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
-                                     ATTR_CTIME);
-
-       if (dst_inode)
-               bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
-                                             ATTR_CTIME);
-err:
-       bch2_trans_put(trans);
-
-       bch2_fs_quota_transfer(c, src_inode,
-                              bch_qid(&src_inode->ei_inode),
-                              1 << QTYP_PRJ,
-                              KEY_TYPE_QUOTA_NOCHECK);
-       if (dst_inode)
-               bch2_fs_quota_transfer(c, dst_inode,
-                                      bch_qid(&dst_inode->ei_inode),
-                                      1 << QTYP_PRJ,
-                                      KEY_TYPE_QUOTA_NOCHECK);
-
-       bch2_unlock_inodes(INODE_UPDATE_LOCK,
-                          src_dir,
-                          dst_dir,
-                          src_inode,
-                          dst_inode);
-
-       return bch2_err_class(ret);
-}
-
-static void bch2_setattr_copy(struct mnt_idmap *idmap,
-                             struct bch_inode_info *inode,
-                             struct bch_inode_unpacked *bi,
-                             struct iattr *attr)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       unsigned int ia_valid = attr->ia_valid;
-       kuid_t kuid;
-       kgid_t kgid;
-
-       if (ia_valid & ATTR_UID) {
-               kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
-               bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
-       }
-       if (ia_valid & ATTR_GID) {
-               kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
-               bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
-       }
-
-       if (ia_valid & ATTR_SIZE)
-               bi->bi_size = attr->ia_size;
-
-       if (ia_valid & ATTR_ATIME)
-               bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
-       if (ia_valid & ATTR_MTIME)
-               bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
-       if (ia_valid & ATTR_CTIME)
-               bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
-
-       if (ia_valid & ATTR_MODE) {
-               umode_t mode = attr->ia_mode;
-               kgid_t gid = ia_valid & ATTR_GID
-                       ? kgid
-                       : inode->v.i_gid;
-
-               if (!in_group_or_capable(idmap, &inode->v,
-                       make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
-                       mode &= ~S_ISGID;
-               bi->bi_mode = mode;
-       }
-}
-
-int bch2_setattr_nonsize(struct mnt_idmap *idmap,
-                        struct bch_inode_info *inode,
-                        struct iattr *attr)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_qid qid;
-       struct btree_trans *trans;
-       struct btree_iter inode_iter = {};
-       struct bch_inode_unpacked inode_u;
-       struct posix_acl *acl = NULL;
-       kuid_t kuid;
-       kgid_t kgid;
-       int ret;
-
-       mutex_lock(&inode->ei_update_lock);
-
-       qid = inode->ei_qid;
-
-       if (attr->ia_valid & ATTR_UID) {
-               kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
-               qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
-       }
-
-       if (attr->ia_valid & ATTR_GID) {
-               kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
-               qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
-       }
-
-       ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
-                                    KEY_TYPE_QUOTA_PREALLOC);
-       if (ret)
-               goto err;
-
-       trans = bch2_trans_get(c);
-retry:
-       bch2_trans_begin(trans);
-       kfree(acl);
-       acl = NULL;
-
-       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
-                             BTREE_ITER_intent);
-       if (ret)
-               goto btree_err;
-
-       bch2_setattr_copy(idmap, inode, &inode_u, attr);
-
-       if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
-                                    inode_u.bi_mode, &acl);
-               if (ret)
-                       goto btree_err;
-       }
-
-       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 BCH_TRANS_COMMIT_no_enospc);
-btree_err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-       if (unlikely(ret))
-               goto err_trans;
-
-       bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
-
-       if (acl)
-               set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
-       bch2_trans_put(trans);
-err:
-       mutex_unlock(&inode->ei_update_lock);
-
-       return bch2_err_class(ret);
-}
-
-static int bch2_getattr(struct mnt_idmap *idmap,
-                       const struct path *path, struct kstat *stat,
-                       u32 request_mask, unsigned query_flags)
-{
-       struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
-       vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
-
-       stat->dev       = inode->v.i_sb->s_dev;
-       stat->ino       = inode->v.i_ino;
-       stat->mode      = inode->v.i_mode;
-       stat->nlink     = inode->v.i_nlink;
-       stat->uid       = vfsuid_into_kuid(vfsuid);
-       stat->gid       = vfsgid_into_kgid(vfsgid);
-       stat->rdev      = inode->v.i_rdev;
-       stat->size      = i_size_read(&inode->v);
-       stat->atime     = inode_get_atime(&inode->v);
-       stat->mtime     = inode_get_mtime(&inode->v);
-       stat->ctime     = inode_get_ctime(&inode->v);
-       stat->blksize   = block_bytes(c);
-       stat->blocks    = inode->v.i_blocks;
-
-       stat->subvol    = inode->ei_inum.subvol;
-       stat->result_mask |= STATX_SUBVOL;
-
-       if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
-               stat->result_mask |= STATX_DIOALIGN;
-               /*
-                * this is incorrect; we should be tracking this in superblock,
-                * and checking the alignment of open devices
-                */
-               stat->dio_mem_align = SECTOR_SIZE;
-               stat->dio_offset_align = block_bytes(c);
-       }
-
-       if (request_mask & STATX_BTIME) {
-               stat->result_mask |= STATX_BTIME;
-               stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
-       }
-
-       if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
-               stat->attributes |= STATX_ATTR_IMMUTABLE;
-       stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
-
-       if (inode->ei_inode.bi_flags & BCH_INODE_append)
-               stat->attributes |= STATX_ATTR_APPEND;
-       stat->attributes_mask    |= STATX_ATTR_APPEND;
-
-       if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
-               stat->attributes |= STATX_ATTR_NODUMP;
-       stat->attributes_mask    |= STATX_ATTR_NODUMP;
-
-       return 0;
-}
-
-static int bch2_setattr(struct mnt_idmap *idmap,
-                       struct dentry *dentry, struct iattr *iattr)
-{
-       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret;
-
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
-               setattr_prepare(idmap, dentry, iattr);
-       if (ret)
-               return ret;
-
-       return iattr->ia_valid & ATTR_SIZE
-               ? bchfs_truncate(idmap, inode, iattr)
-               : bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-static int bch2_tmpfile(struct mnt_idmap *idmap,
-                       struct inode *vdir, struct file *file, umode_t mode)
-{
-       struct bch_inode_info *inode =
-               __bch2_create(idmap, to_bch_ei(vdir),
-                             file->f_path.dentry, mode, 0,
-                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-
-       if (IS_ERR(inode))
-               return bch2_err_class(PTR_ERR(inode));
-
-       d_mark_tmpfile(file, &inode->v);
-       d_instantiate(file->f_path.dentry, &inode->v);
-       return finish_open_simple(file, 0);
-}
-
-struct bch_fiemap_extent {
-       struct bkey_buf kbuf;
-       unsigned        flags;
-};
-
-static int bch2_fill_extent(struct bch_fs *c,
-                           struct fiemap_extent_info *info,
-                           struct bch_fiemap_extent *fe)
-{
-       struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
-       unsigned flags = fe->flags;
-
-       BUG_ON(!k.k->size);
-
-       if (bkey_extent_is_direct_data(k.k)) {
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-               int ret;
-
-               if (k.k->type == KEY_TYPE_reflink_v)
-                       flags |= FIEMAP_EXTENT_SHARED;
-
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int flags2 = 0;
-                       u64 offset = p.ptr.offset;
-
-                       if (p.ptr.unwritten)
-                               flags2 |= FIEMAP_EXTENT_UNWRITTEN;
-
-                       if (p.crc.compression_type)
-                               flags2 |= FIEMAP_EXTENT_ENCODED;
-                       else
-                               offset += p.crc.offset;
-
-                       if ((offset & (block_sectors(c) - 1)) ||
-                           (k.k->size & (block_sectors(c) - 1)))
-                               flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
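-                       /* extent fields are in 512-byte sectors; fiemap wants bytes */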
-                       ret = fiemap_fill_next_extent(info,
-                                               bkey_start_offset(k.k) << 9,
-                                               offset << 9,
-                                               k.k->size << 9, flags|flags2);
-                       if (ret)
-                               return ret;
-               }
-
-               return 0;
-       } else if (bkey_extent_is_inline_data(k.k)) {
-               return fiemap_fill_next_extent(info,
-                                              bkey_start_offset(k.k) << 9,
-                                              0, k.k->size << 9,
-                                              flags|
-                                              FIEMAP_EXTENT_DATA_INLINE);
-       } else if (k.k->type == KEY_TYPE_reservation) {
-               return fiemap_fill_next_extent(info,
-                                              bkey_start_offset(k.k) << 9,
-                                              0, k.k->size << 9,
-                                              flags|
-                                              FIEMAP_EXTENT_DELALLOC|
-                                              FIEMAP_EXTENT_UNWRITTEN);
-       } else {
-               BUG();
-       }
-}
-
-/*
- * Scan a range of an inode for data in pagecache.
- *
- * Intended to be retryable, so don't modify the output params until success is
- * imminent.
- */
-static int
-bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
-                          bool nonblock)
-{
-       loff_t  dstart, dend;
-
-       dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
-       if (dstart < 0)
-               return dstart;
-
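-       /* no cached data in the range */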
-       if (dstart == *end) {
-               *start = dstart;
-               return 0;
-       }
-
-       dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
-       if (dend < 0)
-               return dend;
-
-       /* race */
-       BUG_ON(dstart == dend);
-
-       *start = dstart;
-       *end = dend;
-       return 0;
-}
-
-/*
- * Scan a range of pagecache that corresponds to a file mapping hole in the
- * extent btree. If data is found, fake up an extent key so it looks like a
- * delalloc extent to the rest of the fiemap processing code.
- */
-static int
-bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
-                                 u64 start, u64 end, struct bch_fiemap_extent *cur)
-{
-       struct bch_fs           *c = trans->c;
-       struct bkey_i_extent    *delextent;
-       struct bch_extent_ptr   ptr = {};
-       loff_t                  dstart = start << 9, dend = end << 9;
-       int                     ret;
-
-       /*
-        * We hold btree locks here so we cannot block on folio locks without
-        * dropping trans locks first. Run a nonblocking scan for the common
-        * case of no folios over holes and fall back on failure.
-        *
-        * Note that dropping locks like this is technically racy against
-        * writeback inserting to the extent tree, but a non-sync fiemap scan is
-        * fundamentally racy with writeback anyways. Therefore, just report the
-        * fundamentally racy with writeback anyway. Therefore, just report the
-        */
-       ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
-       if (ret == -EAGAIN)
-               ret = drop_locks_do(trans,
-                       bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
-       if (ret < 0)
-               return ret;
-
-       /*
-        * Create a fake extent key in the buffer. We have to add a dummy extent
-        * pointer for the fill code to add an extent entry. It's explicitly
-        * zeroed to reflect delayed allocation (i.e. phys offset 0).
-        */
-       bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
-       delextent = bkey_extent_init(cur->kbuf.k);
-       delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
-       delextent->k.size = (dend - dstart) >> 9;
-       bch2_bkey_append_ptr(&delextent->k_i, ptr);
-
-       cur->flags = FIEMAP_EXTENT_DELALLOC;
-
-       return 0;
-}
-
-static int bch2_next_fiemap_extent(struct btree_trans *trans,
-                                  struct bch_inode_info *inode,
-                                  u64 start, u64 end,
-                                  struct bch_fiemap_extent *cur)
-{
-       u32 snapshot;
-       int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
-       if (ret)
-               return ret;
-
-       struct btree_iter iter;
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(inode->ei_inum.inum, start, snapshot), 0);
-
-       struct bkey_s_c k =
-               bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
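-       /*
-        * Scan the pagecache over the hole up to the next btree extent, or to
-        * the end of the range if there is none:
-        */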
-       u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
-
-       ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
-       if (ret)
-               goto err;
-
-       struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
-
-       /*
-        * Does the pagecache or the btree take precedence?
-        *
-        * It _should_ be the pagecache, so that we correctly report delalloc
-        * extents when dirty in the pagecache (we're COW, after all).
-        *
-        * But we'd have to add per-sector writeback tracking to
-        * bch_folio_state, otherwise we report delalloc extents for clean
-        * cached data in the pagecache.
-        *
-        * We should do this, but even then fiemap won't report stable mappings:
-        * on bcachefs data moves around in the background (copygc, rebalance)
-        * and we don't provide a way for userspace to lock that out.
-        */
-       if (k.k &&
-           bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
-                   pagecache_start)) {
-               bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
-               bch2_cut_front(iter.pos, cur->kbuf.k);
-               bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
-               cur->flags = 0;
-       } else if (k.k) {
-               bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
-       }
-
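-       /*
-        * reflink pointers are indirect; resolve them to the underlying extent
-        * so fiemap can report the real extent:
-        */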
-       if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
-               unsigned sectors = cur->kbuf.k->k.size;
-               s64 offset_into_extent = 0;
-               enum btree_id data_btree = BTREE_ID_extents;
-               ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
-                                               &cur->kbuf);
-               if (ret)
-                       goto err;
-
-               struct bkey_i *k = cur->kbuf.k;
-               sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
-
-               bch2_cut_front(POS(k->k.p.inode,
-                                  bkey_start_offset(&k->k) + offset_into_extent),
-                              k);
-               bch2_key_resize(&k->k, sectors);
-               k->k.p = iter.pos;
-               k->k.p.offset += k->k.size;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
-                      u64 start, u64 len)
-{
-       struct bch_fs *c = vinode->i_sb->s_fs_info;
-       struct bch_inode_info *ei = to_bch_ei(vinode);
-       struct btree_trans *trans;
-       struct bch_fiemap_extent cur, prev;
-       int ret = 0;
-
-       ret = fiemap_prep(&ei->v, info, start, &len, 0);
-       if (ret)
-               return ret;
-
-       if (start + len < start)
-               return -EINVAL;
-
-       /* convert the byte range to 512-byte sectors, computing end before shifting start: */
-       u64 end = (start + len) >> 9;
-       start >>= 9;
-
-       bch2_bkey_buf_init(&cur.kbuf);
-       bch2_bkey_buf_init(&prev.kbuf);
-       bkey_init(&prev.kbuf.k->k);
-
-       trans = bch2_trans_get(c);
-
-       while (start < end) {
-               ret = lockrestart_do(trans,
-                       bch2_next_fiemap_extent(trans, ei, start, end, &cur));
-               if (ret)
-                       goto err;
-
-               BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
-               BUG_ON(cur.kbuf.k->k.p.offset > end);
-
-               if (bkey_start_offset(&cur.kbuf.k->k) == end)
-                       break;
-
-               start = cur.kbuf.k->k.p.offset;
-
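-               /*
-                * Emit extents one iteration behind, so the final extent can be
-                * flagged FIEMAP_EXTENT_LAST:
-                */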
-               if (!bkey_deleted(&prev.kbuf.k->k)) {
-                       bch2_trans_unlock(trans);
-                       ret = bch2_fill_extent(c, info, &prev);
-                       if (ret)
-                               goto err;
-               }
-
-               bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
-               prev.flags = cur.flags;
-       }
-
-       if (!bkey_deleted(&prev.kbuf.k->k)) {
-               bch2_trans_unlock(trans);
-               prev.flags |= FIEMAP_EXTENT_LAST;
-               ret = bch2_fill_extent(c, info, &prev);
-       }
-err:
-       bch2_trans_put(trans);
-       bch2_bkey_buf_exit(&cur.kbuf, c);
-       bch2_bkey_buf_exit(&prev.kbuf, c);
-
-       return bch2_err_class(ret < 0 ? ret : 0);
-}
-
-static const struct vm_operations_struct bch_vm_ops = {
-       .fault          = bch2_page_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = bch2_page_mkwrite,
-};
-
-static int bch2_mmap_prepare(struct vm_area_desc *desc)
-{
-       file_accessed(desc->file);
-
-       desc->vm_ops = &bch_vm_ops;
-       return 0;
-}
-
-/* Directories: */
-
-static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-       return generic_file_llseek_size(file, offset, whence,
-                                       S64_MAX, S64_MAX);
-}
-
-static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-
-       if (!dir_emit_dots(file, ctx))
-               return 0;
-
-       int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
-
-       bch_err_fn(c, ret);
-       return bch2_err_class(ret);
-}
-
-static int bch2_open(struct inode *vinode, struct file *file)
-{
-       if (file->f_flags & (O_WRONLY|O_RDWR)) {
-               struct bch_inode_info *inode = to_bch_ei(vinode);
-               struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-               int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
-               if (ret)
-                       return ret;
-       }
-
-       file->f_mode |= FMODE_CAN_ODIRECT;
-
-       return generic_file_open(vinode, file);
-}
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
-       [__BCH_INODE_sync]              = FS_SYNC_FL,
-       [__BCH_INODE_immutable]         = FS_IMMUTABLE_FL,
-       [__BCH_INODE_append]            = FS_APPEND_FL,
-       [__BCH_INODE_nodump]            = FS_NODUMP_FL,
-       [__BCH_INODE_noatime]           = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
-       [__BCH_INODE_sync]      = FS_XFLAG_SYNC,
-       [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
-       [__BCH_INODE_append]    = FS_XFLAG_APPEND,
-       [__BCH_INODE_nodump]    = FS_XFLAG_NODUMP,
-       [__BCH_INODE_noatime]   = FS_XFLAG_NOATIME,
-};
-
-static int bch2_fileattr_get(struct dentry *dentry,
-                            struct file_kattr *fa)
-{
-       struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-       fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
-
-       if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
-               fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
-       if (bch2_inode_casefold(c, &inode->ei_inode))
-               fa->flags |= FS_CASEFOLD_FL;
-
-       fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
-       return 0;
-}
-
-struct flags_set {
-       unsigned                mask;
-       unsigned                flags;
-       unsigned                projid;
-       bool                    set_project;
-       bool                    set_casefold;
-       bool                    casefold;
-};
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
-                                     struct bch_inode_info *inode,
-                                     struct bch_inode_unpacked *bi,
-                                     void *p)
-{
-       struct bch_fs *c = trans->c;
-       struct flags_set *s = p;
-
-       /*
-        * We're relying on btree locking here for exclusion with other ioctl
-        * calls - use the flags in the btree (@bi), not inode->i_flags:
-        */
-       if (!S_ISREG(bi->bi_mode) &&
-           !S_ISDIR(bi->bi_mode) &&
-           (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
-               return -EINVAL;
-
-       if (s->casefold != bch2_inode_casefold(c, bi)) {
-               int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
-               if (ret)
-                       return ret;
-       }
-
-       if (s->set_project) {
-               bi->bi_project = s->projid;
-               bi->bi_fields_set |= BIT(Inode_opt_project);
-       }
-
-       bi->bi_flags &= ~s->mask;
-       bi->bi_flags |= s->flags;
-
-       bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
-       return 0;
-}
-
-static int bch2_fileattr_set(struct mnt_idmap *idmap,
-                            struct dentry *dentry,
-                            struct file_kattr *fa)
-{
-       struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct flags_set s = {};
-       int ret;
-
-       if (fa->fsx_valid) {
-               fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
-               s.mask = map_defined(bch_flags_to_xflags);
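-               /* map_flags_rev() clears the bits it translates; leftovers are unsupported */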
-               s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
-               if (fa->fsx_xflags)
-                       return -EOPNOTSUPP;
-
-               if (fa->fsx_projid >= U32_MAX)
-                       return -EINVAL;
-
-               /*
-                * inode fields accessible via the xattr interface are stored with a +1
-                * bias, so that 0 means unset:
-                */
-               if ((inode->ei_inode.bi_project ||
-                    fa->fsx_projid) &&
-                   inode->ei_inode.bi_project != fa->fsx_projid + 1) {
-                       s.projid = fa->fsx_projid + 1;
-                       s.set_project = true;
-               }
-       }
-
-       if (fa->flags_valid) {
-               s.mask = map_defined(bch_flags_to_uflags);
-
-               s.set_casefold = true;
-               s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
-               fa->flags &= ~FS_CASEFOLD_FL;
-
-               s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
-               if (fa->flags)
-                       return -EOPNOTSUPP;
-       }
-
-       mutex_lock(&inode->ei_update_lock);
-       ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
-               (s.set_project
-                ? bch2_set_projid(c, inode, fa->fsx_projid)
-                : 0) ?:
-               bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
-                              ATTR_CTIME);
-       mutex_unlock(&inode->ei_update_lock);
-
-       return bch2_err_class(ret);
-}
-
-static const struct file_operations bch_file_operations = {
-       .open           = bch2_open,
-       .llseek         = bch2_llseek,
-       .read_iter      = bch2_read_iter,
-       .write_iter     = bch2_write_iter,
-       .mmap_prepare   = bch2_mmap_prepare,
-       .get_unmapped_area = thp_get_unmapped_area,
-       .fsync          = bch2_fsync,
-       .splice_read    = filemap_splice_read,
-       .splice_write   = iter_file_splice_write,
-       .fallocate      = bch2_fallocate_dispatch,
-       .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = bch2_compat_fs_ioctl,
-#endif
-       .remap_file_range = bch2_remap_file_range,
-};
-
-static const struct inode_operations bch_file_inode_operations = {
-       .getattr        = bch2_getattr,
-       .setattr        = bch2_setattr,
-       .fiemap         = bch2_fiemap,
-       .listxattr      = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       .get_inode_acl  = bch2_get_acl,
-       .set_acl        = bch2_set_acl,
-#endif
-       .fileattr_get   = bch2_fileattr_get,
-       .fileattr_set   = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
-       .lookup         = bch2_lookup,
-       .create         = bch2_create,
-       .link           = bch2_link,
-       .unlink         = bch2_unlink,
-       .symlink        = bch2_symlink,
-       .mkdir          = bch2_mkdir,
-       .rmdir          = bch2_unlink,
-       .mknod          = bch2_mknod,
-       .rename         = bch2_rename2,
-       .getattr        = bch2_getattr,
-       .setattr        = bch2_setattr,
-       .tmpfile        = bch2_tmpfile,
-       .listxattr      = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       .get_inode_acl  = bch2_get_acl,
-       .set_acl        = bch2_set_acl,
-#endif
-       .fileattr_get   = bch2_fileattr_get,
-       .fileattr_set   = bch2_fileattr_set,
-};
-
-static const struct file_operations bch_dir_file_operations = {
-       .llseek         = bch2_dir_llseek,
-       .read           = generic_read_dir,
-       .iterate_shared = bch2_vfs_readdir,
-       .fsync          = bch2_fsync,
-       .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = bch2_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
-       .get_link       = page_get_link,
-       .getattr        = bch2_getattr,
-       .setattr        = bch2_setattr,
-       .listxattr      = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       .get_inode_acl  = bch2_get_acl,
-       .set_acl        = bch2_set_acl,
-#endif
-       .fileattr_get   = bch2_fileattr_get,
-       .fileattr_set   = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_special_inode_operations = {
-       .getattr        = bch2_getattr,
-       .setattr        = bch2_setattr,
-       .listxattr      = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       .get_inode_acl  = bch2_get_acl,
-       .set_acl        = bch2_set_acl,
-#endif
-       .fileattr_get   = bch2_fileattr_get,
-       .fileattr_set   = bch2_fileattr_set,
-};
-
-static const struct address_space_operations bch_address_space_operations = {
-       .read_folio     = bch2_read_folio,
-       .writepages     = bch2_writepages,
-       .readahead      = bch2_readahead,
-       .dirty_folio    = filemap_dirty_folio,
-       .write_begin    = bch2_write_begin,
-       .write_end      = bch2_write_end,
-       .invalidate_folio = bch2_invalidate_folio,
-       .release_folio  = bch2_release_folio,
-#ifdef CONFIG_MIGRATION
-       .migrate_folio  = filemap_migrate_folio,
-#endif
-       .error_remove_folio = generic_error_remove_folio,
-};
-
-struct bcachefs_fid {
-       u64             inum;
-       u32             subvol;
-       u32             gen;
-} __packed;
-
-struct bcachefs_fid_with_parent {
-       struct bcachefs_fid     fid;
-       struct bcachefs_fid     dir;
-} __packed;
-
-static int bcachefs_fid_valid(int fh_len, int fh_type)
-{
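-       /* fh_len is in units of u32s, per the exportfs convention */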
-       switch (fh_type) {
-       case FILEID_BCACHEFS_WITHOUT_PARENT:
-               return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
-       case FILEID_BCACHEFS_WITH_PARENT:
-               return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
-       default:
-               return false;
-       }
-}
-
-static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
-{
-       return (struct bcachefs_fid) {
-               .inum   = inode->ei_inum.inum,
-               .subvol = inode->ei_inum.subvol,
-               .gen    = inode->ei_inode.bi_generation,
-       };
-}
-
-static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
-                         struct inode *vdir)
-{
-       struct bch_inode_info *inode    = to_bch_ei(vinode);
-       struct bch_inode_info *dir      = to_bch_ei(vdir);
-       int min_len;
-
-       if (!S_ISDIR(inode->v.i_mode) && dir) {
-               struct bcachefs_fid_with_parent *fid = (void *) fh;
-
-               min_len = sizeof(*fid) / sizeof(u32);
-               if (*len < min_len) {
-                       *len = min_len;
-                       return FILEID_INVALID;
-               }
-
-               fid->fid = bch2_inode_to_fid(inode);
-               fid->dir = bch2_inode_to_fid(dir);
-
-               *len = min_len;
-               return FILEID_BCACHEFS_WITH_PARENT;
-       } else {
-               struct bcachefs_fid *fid = (void *) fh;
-
-               min_len = sizeof(*fid) / sizeof(u32);
-               if (*len < min_len) {
-                       *len = min_len;
-                       return FILEID_INVALID;
-               }
-               *fid = bch2_inode_to_fid(inode);
-
-               *len = min_len;
-               return FILEID_BCACHEFS_WITHOUT_PARENT;
-       }
-}
-
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
-                                       struct bcachefs_fid fid)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
-                                   .subvol = fid.subvol,
-                                   .inum = fid.inum,
-       });
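-       /* generation mismatch: the inode number was reused, so the handle is stale */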
-       if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
-               iput(vinode);
-               vinode = ERR_PTR(-ESTALE);
-       }
-       return vinode;
-}
-
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
-               int fh_len, int fh_type)
-{
-       struct bcachefs_fid *fid = (void *) _fid;
-
-       if (!bcachefs_fid_valid(fh_len, fh_type))
-               return NULL;
-
-       return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
-}
-
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
-               int fh_len, int fh_type)
-{
-       struct bcachefs_fid_with_parent *fid = (void *) _fid;
-
-       if (!bcachefs_fid_valid(fh_len, fh_type) ||
-           fh_type != FILEID_BCACHEFS_WITH_PARENT)
-               return NULL;
-
-       return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
-}
-
-static struct dentry *bch2_get_parent(struct dentry *child)
-{
-       struct bch_inode_info *inode = to_bch_ei(child->d_inode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       subvol_inum parent_inum = {
-               .subvol = inode->ei_inode.bi_parent_subvol ?:
-                       inode->ei_inum.subvol,
-               .inum = inode->ei_inode.bi_dir,
-       };
-
-       return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
-}
-
-static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
-{
-       struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
-       struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans *trans;
-       struct btree_iter iter1;
-       struct btree_iter iter2;
-       struct bkey_s_c k;
-       struct bkey_s_c_dirent d;
-       struct bch_inode_unpacked inode_u;
-       subvol_inum target;
-       u32 snapshot;
-       struct qstr dirent_name;
-       unsigned name_len = 0;
-       int ret;
-
-       if (!S_ISDIR(dir->v.i_mode))
-               return -EINVAL;
-
-       trans = bch2_trans_get(c);
-
-       bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
-                            POS(dir->ei_inode.bi_inum, 0), 0);
-       bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
-                            POS(dir->ei_inode.bi_inum, 0), 0);
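-       /* iter1: direct backpointer lookup; iter2: linear scan fallback for hardlinks */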
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
-       bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
-
-       ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
-       if (ret)
-               goto err;
-
-       if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
-               bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
-
-               k = bch2_btree_iter_peek_slot(trans, &iter1);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               if (k.k->type != KEY_TYPE_dirent) {
-                       ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
-                       goto err;
-               }
-
-               d = bkey_s_c_to_dirent(k);
-               ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
-               if (ret > 0)
-                       ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
-               if (ret)
-                       goto err;
-
-               if (subvol_inum_eq(target, inode->ei_inum))
-                       goto found;
-       } else {
-               /*
-                * File with multiple hardlinks and our backref is to the wrong
-                * directory - linear search:
-                */
-               for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
-                       if (k.k->p.inode > dir->ei_inode.bi_inum)
-                               break;
-
-                       if (k.k->type != KEY_TYPE_dirent)
-                               continue;
-
-                       d = bkey_s_c_to_dirent(k);
-                       ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
-                       if (ret < 0)
-                               break;
-                       if (ret)
-                               continue;
-
-                       if (subvol_inum_eq(target, inode->ei_inum))
-                               goto found;
-               }
-       }
-
-       ret = -ENOENT;
-       goto err;
-found:
-       dirent_name = bch2_dirent_get_name(d);
-
-       name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
-       memcpy(name, dirent_name.name, name_len);
-       name[name_len] = '\0';
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       bch2_trans_iter_exit(trans, &iter1);
-       bch2_trans_iter_exit(trans, &iter2);
-       bch2_trans_put(trans);
-
-       return ret;
-}
-
-static const struct export_operations bch_export_ops = {
-       .encode_fh      = bch2_encode_fh,
-       .fh_to_dentry   = bch2_fh_to_dentry,
-       .fh_to_parent   = bch2_fh_to_parent,
-       .get_parent     = bch2_get_parent,
-       .get_name       = bch2_get_name,
-};
-
-static void bch2_vfs_inode_init(struct btree_trans *trans,
-                               subvol_inum inum,
-                               struct bch_inode_info *inode,
-                               struct bch_inode_unpacked *bi,
-                               struct bch_subvolume *subvol)
-{
-       inode->v.i_ino          = inum.inum;
-       inode->ei_inum          = inum;
-       inode->ei_inode.bi_inum = inum.inum;
-       bch2_inode_update_after_write(trans, inode, bi, ~0);
-
-       inode->v.i_blocks       = bi->bi_sectors;
-       inode->v.i_rdev         = bi->bi_dev;
-       inode->v.i_generation   = bi->bi_generation;
-       inode->v.i_size         = bi->bi_size;
-
-       inode->ei_flags         = 0;
-       inode->ei_quota_reserved = 0;
-       inode->ei_qid           = bch_qid(bi);
-
-       if (BCH_SUBVOLUME_SNAP(subvol))
-               set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
-       inode->v.i_mapping->a_ops = &bch_address_space_operations;
-
-       switch (inode->v.i_mode & S_IFMT) {
-       case S_IFREG:
-               inode->v.i_op   = &bch_file_inode_operations;
-               inode->v.i_fop  = &bch_file_operations;
-               break;
-       case S_IFDIR:
-               inode->v.i_op   = &bch_dir_inode_operations;
-               inode->v.i_fop  = &bch_dir_file_operations;
-               break;
-       case S_IFLNK:
-               inode_nohighmem(&inode->v);
-               inode->v.i_op   = &bch_symlink_inode_operations;
-               break;
-       default:
-               init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
-               inode->v.i_op   = &bch_special_inode_operations;
-               break;
-       }
-
-       mapping_set_folio_min_order(inode->v.i_mapping,
-                                   get_order(trans->c->opts.block_size));
-}
-
-static void bch2_free_inode(struct inode *vinode)
-{
-       kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
-}
-
-static int inode_update_times_fn(struct btree_trans *trans,
-                                struct bch_inode_info *inode,
-                                struct bch_inode_unpacked *bi,
-                                void *p)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
-       bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
-       bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
-       bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
-
-       return 0;
-}
-
-static int bch2_vfs_write_inode(struct inode *vinode,
-                               struct writeback_control *wbc)
-{
-       struct bch_fs *c = vinode->i_sb->s_fs_info;
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       int ret;
-
-       mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                              ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
-       mutex_unlock(&inode->ei_update_lock);
-
-       return bch2_err_class(ret);
-}
-
-static void bch2_evict_inode(struct inode *vinode)
-{
-       struct bch_fs *c = vinode->i_sb->s_fs_info;
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
-
-       /*
-        * evict() has waited for outstanding writeback, we'll do no more IO
-        * through this inode: it's safe to remove from VFS inode hashtable here
-        *
-        * Do that now so that other threads aren't blocked from pulling it back
-        * in; there's no reason for them to be:
-        */
-       if (!delete)
-               bch2_inode_hash_remove(c, inode);
-
-       truncate_inode_pages_final(&inode->v.i_data);
-
-       clear_inode(&inode->v);
-
-       BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
-
-       if (delete) {
-               bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
-                               KEY_TYPE_QUOTA_WARN);
-               bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
-                               KEY_TYPE_QUOTA_WARN);
-               int ret = bch2_inode_rm(c, inode_inum(inode));
-               if (ret && !bch2_err_matches(ret, EROFS)) {
-                       bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
-                                   inode->ei_inum.subvol,
-                                   inode->ei_inum.inum);
-                       bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
-               }
-
-               /*
-                * If we are deleting, we need it present in the vfs hash table
-                * so that fsck can check if unlinked inodes are still open:
-                */
-               bch2_inode_hash_remove(c, inode);
-       }
-
-       mutex_lock(&c->vfs_inodes_lock);
-       list_del_init(&inode->ei_vfs_inode_list);
-       mutex_unlock(&c->vfs_inodes_lock);
-}
-
-void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
-{
-       struct bch_inode_info *inode;
-       DARRAY(struct bch_inode_info *) grabbed;
-       bool clean_pass = false, this_pass_clean;
-
-       /*
-        * Initially, we scan for inodes without I_DONTCACHE, then mark them to
-        * be pruned with d_mark_dontcache().
-        *
-        * Once we've had a clean pass where we didn't find any inodes without
-        * I_DONTCACHE, we wait for them to be freed:
-        */
-
-       darray_init(&grabbed);
-       darray_make_room(&grabbed, 1024);
-again:
-       cond_resched();
-       this_pass_clean = true;
-
-       mutex_lock(&c->vfs_inodes_lock);
-       list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
-               if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
-                       continue;
-
-               if (!(inode->v.i_state & I_DONTCACHE) &&
-                   !(inode->v.i_state & I_FREEING) &&
-                   igrab(&inode->v)) {
-                       this_pass_clean = false;
-
-                       if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
-                               iput(&inode->v);
-                               break;
-                       }
-               } else if (clean_pass && this_pass_clean) {
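-                       /*
-                        * inode is being freed; wait for eviction to complete,
-                        * then rescan:
-                        */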
-                       struct wait_bit_queue_entry wqe;
-                       struct wait_queue_head *wq_head;
-
-                       wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
-                       prepare_to_wait_event(wq_head, &wqe.wq_entry,
-                                             TASK_UNINTERRUPTIBLE);
-                       mutex_unlock(&c->vfs_inodes_lock);
-
-                       schedule();
-                       finish_wait(wq_head, &wqe.wq_entry);
-                       goto again;
-               }
-       }
-       mutex_unlock(&c->vfs_inodes_lock);
-
-       darray_for_each(grabbed, i) {
-               inode = *i;
-               d_mark_dontcache(&inode->v);
-               d_prune_aliases(&inode->v);
-               iput(&inode->v);
-       }
-       grabbed.nr = 0;
-
-       if (!clean_pass || !this_pass_clean) {
-               clean_pass = this_pass_clean;
-               goto again;
-       }
-
-       darray_exit(&grabbed);
-}
-
-static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-       struct super_block *sb = dentry->d_sb;
-       struct bch_fs *c = sb->s_fs_info;
-       struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
-       unsigned shift = sb->s_blocksize_bits - 9;
-       /*
-        * This assumes inodes take up 64 bytes, which is a decent average
-        * number; capacity and used are in 512-byte sectors, so shifting free
-        * sectors left by 3 gives free bytes divided by 64:
-        */
-       u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-
-       buf->f_type     = BCACHEFS_STATFS_MAGIC;
-       buf->f_bsize    = sb->s_blocksize;
-       buf->f_blocks   = usage.capacity >> shift;
-       buf->f_bfree    = usage.free >> shift;
-       buf->f_bavail   = avail_factor(usage.free) >> shift;
-
-       buf->f_files    = usage.nr_inodes + avail_inodes;
-       buf->f_ffree    = avail_inodes;
-
-       buf->f_fsid     = uuid_to_fsid(c->sb.user_uuid.b);
-       buf->f_namelen  = BCH_NAME_MAX;
-
-       return 0;
-}
-
-static int bch2_sync_fs(struct super_block *sb, int wait)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       int ret;
-
-       trace_bch2_sync_fs(sb, wait);
-
-       if (c->opts.journal_flush_disabled)
-               return 0;
-
-       if (!wait) {
-               bch2_journal_flush_async(&c->journal, NULL);
-               return 0;
-       }
-
-       ret = bch2_journal_flush(&c->journal);
-       return bch2_err_class(ret);
-}
-
-static struct bch_fs *bch2_path_to_fs(const char *path)
-{
-       struct bch_fs *c;
-       dev_t dev;
-       int ret;
-
-       ret = lookup_bdev(path, &dev);
-       if (ret)
-               return ERR_PTR(ret);
-
-       c = bch2_dev_to_fs(dev);
-       if (c)
-               closure_put(&c->cl);
-       return c ?: ERR_PTR(-ENOENT);
-}
-
-static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
-{
-       struct bch_fs *c = root->d_sb->s_fs_info;
-       bool first = true;
-
-       guard(rcu)();
-       for_each_online_member_rcu(c, ca) {
-               if (!first)
-                       seq_putc(seq, ':');
-               first = false;
-               seq_puts(seq, ca->disk_sb.sb_name);
-       }
-
-       return 0;
-}
-
-static int bch2_show_options(struct seq_file *seq, struct dentry *root)
-{
-       struct bch_fs *c = root->d_sb->s_fs_info;
-       struct printbuf buf = PRINTBUF;
-
-       bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
-                         OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
-       printbuf_nul_terminate(&buf);
-       seq_printf(seq, ",%s", buf.buf);
-
-       int ret = buf.allocation_failure ? -ENOMEM : 0;
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static void bch2_put_super(struct super_block *sb)
-{
-       struct bch_fs *c = sb->s_fs_info;
-
-       __bch2_fs_stop(c);
-}
-
-/*
- * bcachefs doesn't currently integrate with intwrite freeze protection, but the
- * internal write references serve the same purpose. Therefore reuse the
- * read-only transition code to perform the quiesce. The caveat is that we don't
- * currently have the ability to block tasks that want a write reference while
- * the superblock is frozen. This is fine for now, but we should either add
- * blocking support or find a way to integrate sb_start_intwrite() and friends.
- */
-static int bch2_freeze(struct super_block *sb)
-{
-       struct bch_fs *c = sb->s_fs_info;
-
-       down_write(&c->state_lock);
-       bch2_fs_read_only(c);
-       up_write(&c->state_lock);
-       return 0;
-}
-
-static int bch2_unfreeze(struct super_block *sb)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       int ret;
-
-       if (test_bit(BCH_FS_emergency_ro, &c->flags))
-               return 0;
-
-       down_write(&c->state_lock);
-       ret = bch2_fs_read_write(c);
-       up_write(&c->state_lock);
-       return ret;
-}
-
-static const struct super_operations bch_super_operations = {
-       .alloc_inode    = bch2_alloc_inode,
-       .free_inode     = bch2_free_inode,
-       .write_inode    = bch2_vfs_write_inode,
-       .evict_inode    = bch2_evict_inode,
-       .sync_fs        = bch2_sync_fs,
-       .statfs         = bch2_statfs,
-       .show_devname   = bch2_show_devname,
-       .show_options   = bch2_show_options,
-       .put_super      = bch2_put_super,
-       .freeze_fs      = bch2_freeze,
-       .unfreeze_fs    = bch2_unfreeze,
-};
-
-static int bch2_set_super(struct super_block *s, void *data)
-{
-       s->s_fs_info = data;
-       return 0;
-}
-
-static int bch2_noset_super(struct super_block *s, void *data)
-{
-       return -EBUSY;
-}
-
-typedef DARRAY(struct bch_fs *) darray_fs;
-
-static int bch2_test_super(struct super_block *s, void *data)
-{
-       struct bch_fs *c = s->s_fs_info;
-       darray_fs *d = data;
-
-       if (!c)
-               return false;
-
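-       /* match only if every device path resolved to this same filesystem */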
-       darray_for_each(*d, i)
-               if (c != *i)
-                       return false;
-       return true;
-}
-
-static int bch2_fs_get_tree(struct fs_context *fc)
-{
-       struct bch_fs *c;
-       struct super_block *sb;
-       struct inode *vinode;
-       struct bch2_opts_parse *opts_parse = fc->fs_private;
-       struct bch_opts opts = opts_parse->opts;
-       darray_const_str devs;
-       darray_fs devs_to_fs = {};
-       int ret;
-
-       opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
-       opt_set(opts, nostart, true);
-
-       if (!fc->source || strlen(fc->source) == 0)
-               return -EINVAL;
-
-       ret = bch2_split_devs(fc->source, &devs);
-       if (ret)
-               return ret;
-
-       darray_for_each(devs, i) {
-               ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
-               if (ret)
-                       goto err;
-       }
-
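-       /*
-        * Check for an existing superblock for these devices first;
-        * bch2_noset_super() fails with -EBUSY instead of creating one, so
-        * sget() only succeeds here if the filesystem is already mounted:
-        */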
-       sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
-       if (!IS_ERR(sb))
-               goto got_sb;
-
-       c = bch2_fs_open(&devs, &opts);
-       ret = PTR_ERR_OR_ZERO(c);
-       if (ret)
-               goto err;
-
-       if (opt_defined(opts, discard))
-               set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
-
-       /* Some options can't be parsed until after the fs is started: */
-       opts = bch2_opts_empty();
-       ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
-       if (ret)
-               goto err_stop_fs;
-
-       bch2_opts_apply(&c->opts, opts);
-
-       ret = bch2_fs_start(c);
-       if (ret)
-               goto err_stop_fs;
-
-       /*
-        * We might be doing a RO mount because other options required it, or we
-        * have no alloc info and it's a small image with no room to regenerate
-        * it
-        */
-       if (c->opts.read_only)
-               fc->sb_flags |= SB_RDONLY;
-
-       sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
-       ret = PTR_ERR_OR_ZERO(sb);
-       if (ret)
-               goto err_stop_fs;
-got_sb:
-       c = sb->s_fs_info;
-
-       if (sb->s_root) {
-               if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
-                       ret = -EBUSY;
-                       goto err_put_super;
-               }
-               goto out;
-       }
-
-       sb->s_blocksize         = block_bytes(c);
-       sb->s_blocksize_bits    = ilog2(block_bytes(c));
-       sb->s_maxbytes          = MAX_LFS_FILESIZE;
-       sb->s_op                = &bch_super_operations;
-       sb->s_export_op         = &bch_export_ops;
-#ifdef CONFIG_BCACHEFS_QUOTA
-       sb->s_qcop              = &bch2_quotactl_operations;
-       sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
-#endif
-       sb->s_xattr             = bch2_xattr_handlers;
-       sb->s_magic             = BCACHEFS_STATFS_MAGIC;
-       sb->s_time_gran         = c->sb.nsec_per_time_unit;
-       sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
-       sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
-       super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
-
-       if (c->sb.multi_device)
-               super_set_sysfs_name_uuid(sb);
-       else
-               strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
-
-       sb->s_shrink->seeks     = 0;
-       c->vfs_sb               = sb;
-       strscpy(sb->s_id, c->name, sizeof(sb->s_id));
-
-       ret = super_setup_bdi(sb);
-       if (ret)
-               goto err_put_super;
-
-       sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
-
-       scoped_guard(rcu) {
-               for_each_online_member_rcu(c, ca) {
-                       struct block_device *bdev = ca->disk_sb.bdev;
-
-                       /* XXX: create an anonymous device for multi device filesystems */
-                       sb->s_bdev      = bdev;
-                       sb->s_dev       = bdev->bd_dev;
-                       break;
-               }
-       }
-
-       c->dev = sb->s_dev;
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       if (c->opts.acl)
-               sb->s_flags     |= SB_POSIXACL;
-#endif
-
-#ifdef CONFIG_UNICODE
-       if (bch2_fs_casefold_enabled(c))
-               sb->s_encoding = c->cf_encoding;
-       generic_set_sb_d_ops(sb);
-#endif
-
-       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
-       ret = PTR_ERR_OR_ZERO(vinode);
-       bch_err_msg(c, ret, "mounting: error getting root inode");
-       if (ret)
-               goto err_put_super;
-
-       sb->s_root = d_make_root(vinode);
-       if (!sb->s_root) {
-               bch_err(c, "error mounting: error allocating root dentry");
-               ret = -ENOMEM;
-               goto err_put_super;
-       }
-
-       sb->s_flags |= SB_ACTIVE;
-out:
-       fc->root = dget(sb->s_root);
-err:
-       darray_exit(&devs_to_fs);
-       bch2_darray_str_exit(&devs);
-       if (ret)
-               pr_err("error: %s", bch2_err_str(ret));
-       /*
-        * On an inconsistency error in recovery we might see an -EROFS derived
-        * errorcode (from the journal), but we don't want to return that to
-        * userspace as that causes util-linux to retry the mount RO - which is
-        * confusing:
-        */
-       if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
-               ret = -EIO;
-       return bch2_err_class(ret);
-
-err_stop_fs:
-       bch2_fs_stop(c);
-       goto err;
-
-err_put_super:
-       if (!sb->s_root)
-               __bch2_fs_stop(c);
-       deactivate_locked_super(sb);
-       goto err;
-}
-
-static void bch2_kill_sb(struct super_block *sb)
-{
-       struct bch_fs *c = sb->s_fs_info;
-
-       generic_shutdown_super(sb);
-       bch2_fs_free(c);
-}
-
-static void bch2_fs_context_free(struct fs_context *fc)
-{
-       struct bch2_opts_parse *opts = fc->fs_private;
-
-       if (opts) {
-               printbuf_exit(&opts->parse_later);
-               kfree(opts);
-       }
-}
-
-static int bch2_fs_parse_param(struct fs_context *fc,
-                              struct fs_parameter *param)
-{
-       /*
-        * the "source" param, i.e., the name of the device(s) to mount,
-        * is handled by the VFS layer.
-        */
-       if (!strcmp(param->key, "source"))
-               return -ENOPARAM;
-
-       struct bch2_opts_parse *opts = fc->fs_private;
-       struct bch_fs *c = NULL;
-
-       /* for reconfigure, we already have a struct bch_fs */
-       if (fc->root)
-               c = fc->root->d_sb->s_fs_info;
-
-       int ret = bch2_parse_one_mount_opt(c, &opts->opts,
-                                          &opts->parse_later, param->key,
-                                          param->string);
-       if (ret)
-               pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
-
-       return bch2_err_class(ret);
-}
-
-static int bch2_fs_reconfigure(struct fs_context *fc)
-{
-       struct super_block *sb = fc->root->d_sb;
-       struct bch2_opts_parse *opts = fc->fs_private;
-       struct bch_fs *c = sb->s_fs_info;
-       int ret = 0;
-
-       opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
-
-       if (opts->opts.read_only != c->opts.read_only) {
-               down_write(&c->state_lock);
-
-               if (opts->opts.read_only) {
-                       bch2_fs_read_only(c);
-
-                       sb->s_flags |= SB_RDONLY;
-               } else {
-                       ret = bch2_fs_read_write(c);
-                       if (ret) {
-                               bch_err(c, "error going rw: %i", ret);
-                               up_write(&c->state_lock);
-                               ret = -EINVAL;
-                               goto err;
-                       }
-
-                       sb->s_flags &= ~SB_RDONLY;
-               }
-
-               c->opts.read_only = opts->opts.read_only;
-
-               up_write(&c->state_lock);
-       }
-
-       if (opt_defined(opts->opts, errors))
-               c->opts.errors = opts->opts.errors;
-err:
-       return bch2_err_class(ret);
-}
-
-static const struct fs_context_operations bch2_context_ops = {
-       .free        = bch2_fs_context_free,
-       .parse_param = bch2_fs_parse_param,
-       .get_tree    = bch2_fs_get_tree,
-       .reconfigure = bch2_fs_reconfigure,
-};
-
-static int bch2_init_fs_context(struct fs_context *fc)
-{
-       struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
-
-       if (!opts)
-               return -ENOMEM;
-
-       opts->parse_later = PRINTBUF;
-
-       fc->ops = &bch2_context_ops;
-       fc->fs_private = opts;
-
-       return 0;
-}
-
-void bch2_fs_vfs_exit(struct bch_fs *c)
-{
-       if (c->vfs_inodes_by_inum_table.ht.tbl)
-               rhltable_destroy(&c->vfs_inodes_by_inum_table);
-       if (c->vfs_inodes_table.tbl)
-               rhashtable_destroy(&c->vfs_inodes_table);
-}
-
-int bch2_fs_vfs_init(struct bch_fs *c)
-{
-       return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
-               rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
-}
-
-static struct file_system_type bcache_fs_type = {
-       .owner                  = THIS_MODULE,
-       .name                   = "bcachefs",
-       .init_fs_context        = bch2_init_fs_context,
-       .kill_sb                = bch2_kill_sb,
-       .fs_flags               = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
-};
-
-MODULE_ALIAS_FS("bcachefs");
-
-void bch2_vfs_exit(void)
-{
-       unregister_filesystem(&bcache_fs_type);
-       kmem_cache_destroy(bch2_inode_cache);
-}
-
-int __init bch2_vfs_init(void)
-{
-       int ret = -ENOMEM;
-
-       bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
-                                     SLAB_ACCOUNT);
-       if (!bch2_inode_cache)
-               goto err;
-
-       ret = register_filesystem(&bcache_fs_type);
-       if (ret)
-               goto err;
-
-       return 0;
-err:
-       bch2_vfs_exit();
-       return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
deleted file mode 100644 (file)
index dd21985..0000000
+++ /dev/null
@@ -1,215 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_H
-#define _BCACHEFS_FS_H
-
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "quota_types.h"
-#include "two_state_shared_lock.h"
-
-#include <linux/seqlock.h>
-#include <linux/stat.h>
-
-struct bch_inode_info {
-       struct inode            v;
-       struct rhash_head       hash;
-       struct rhlist_head      by_inum_hash;
-       subvol_inum             ei_inum;
-
-       struct list_head        ei_vfs_inode_list;
-       unsigned long           ei_flags;
-
-       struct mutex            ei_update_lock;
-       u64                     ei_quota_reserved;
-       unsigned long           ei_last_dirtied;
-       two_state_lock_t        ei_pagecache_lock;
-
-       struct mutex            ei_quota_lock;
-       struct bch_qid          ei_qid;
-
-       /*
-        * When we've been doing nocow writes we'll need to issue flushes to the
-        * underlying block devices
-        *
-        * XXX: a device may have had a flush issued by some other codepath. It
-        * would be better to keep for each device a sequence number that's
-        * incremented when we issue a cache flush, and track here the sequence
-        * number that needs flushing.
-        */
-       struct bch_devs_mask    ei_devs_need_flush;
-
-       /* copy of inode in btree: */
-       struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i)      bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i)   bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i)      bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i)    bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i)    bch2_two_state_lock(&i->ei_pagecache_lock, 1)
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
-       return inode->ei_inum;
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR                 0
-
-/*
- * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT              1
-#define EI_INODE_HASHED                        2
-
-#define to_bch_ei(_inode)                                      \
-       container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
-       return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
-       INODE_PAGECACHE_BLOCK   = (1U << 0),
-       INODE_UPDATE_LOCK       = (1U << 1),
-};
-
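-/*
- * Lock a set of inodes in pointer order, skipping duplicates, so that
- * concurrent multi-inode operations can't deadlock on lock ordering:
- */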
-#define bch2_lock_inodes(_locks, ...)                                  \
-do {                                                                   \
-       struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };             \
-       unsigned i;                                                     \
-                                                                       \
-       bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);                  \
-                                                                       \
-       for (i = 1; i < ARRAY_SIZE(a); i++)                             \
-               if (a[i] != a[i - 1]) {                                 \
-                       if ((_locks) & INODE_PAGECACHE_BLOCK)           \
-                               bch2_pagecache_block_get(a[i]);\
-                       if ((_locks) & INODE_UPDATE_LOCK)                       \
-                               mutex_lock_nested(&a[i]->ei_update_lock, i);\
-               }                                                       \
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...)                                        \
-do {                                                                   \
-       struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };             \
-       unsigned i;                                                     \
-                                                                       \
-       bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);                  \
-                                                                       \
-       for (i = 1; i < ARRAY_SIZE(a); i++)                             \
-               if (a[i] != a[i - 1]) {                                 \
-                       if ((_locks) & INODE_PAGECACHE_BLOCK)           \
-                               bch2_pagecache_block_put(a[i]);\
-                       if ((_locks) & INODE_UPDATE_LOCK)                       \
-                               mutex_unlock(&a[i]->ei_update_lock);    \
-               }                                                       \
-} while (0)
-
-static inline struct bch_inode_info *file_bch_inode(struct file *file)
-{
-       return to_bch_ei(file_inode(file));
-}
-
-static inline bool inode_attr_changing(struct bch_inode_info *dir,
-                               struct bch_inode_info *inode,
-                               enum inode_opt_id id)
-{
-       return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
-               bch2_inode_opt_get(&dir->ei_inode, id) !=
-               bch2_inode_opt_get(&inode->ei_inode, id);
-}
-
-static inline bool inode_attrs_changing(struct bch_inode_info *dir,
-                                struct bch_inode_info *inode)
-{
-       unsigned id;
-
-       for (id = 0; id < Inode_opt_nr; id++)
-               if (inode_attr_changing(dir, inode, id))
-                       return true;
-
-       return false;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHEFS_FS
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
-             struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
-
-int bch2_fs_quota_transfer(struct bch_fs *,
-                          struct bch_inode_info *,
-                          struct bch_qid,
-                          unsigned,
-                          enum quota_acct_mode);
-
-static inline int bch2_set_projid(struct bch_fs *c,
-                                 struct bch_inode_info *inode,
-                                 u32 projid)
-{
-       struct bch_qid qid = inode->ei_qid;
-
-       qid.q[QTYP_PRJ] = projid;
-
-       return bch2_fs_quota_transfer(c, inode, qid,
-                                     1 << QTYP_PRJ,
-                                     KEY_TYPE_QUOTA_PREALLOC);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
-
-/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct btree_trans *,
-                           struct bch_inode_info *,
-                           struct bch_inode_unpacked *, void *);
-
-void bch2_inode_update_after_write(struct btree_trans *,
-                                  struct bch_inode_info *,
-                                  struct bch_inode_unpacked *,
-                                  unsigned);
-int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
-                                 inode_set_fn, void *, unsigned);
-
-int bch2_setattr_nonsize(struct mnt_idmap *,
-                        struct bch_inode_info *,
-                        struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, bool);
-
-void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
-
-void bch2_fs_vfs_exit(struct bch_fs *);
-int bch2_fs_vfs_init(struct bch_fs *);
-
-void bch2_vfs_exit(void);
-int bch2_vfs_init(void);
-
-#else
-
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)       ({ do {} while (0); })
-
-static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
-
-static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
-                                              snapshot_id_list *s) {}
-
-static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
-static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_vfs_exit(void) {}
-static inline int bch2_vfs_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
deleted file mode 100644 (file)
index 15c1e89..0000000
+++ /dev/null
@@ -1,3363 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "darray.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "super.h"
-#include "thread_with_file.h"
-#include "xattr.h"
-
-#include <linux/bsearch.h>
-#include <linux/dcache.h> /* struct qstr */
-
-static int dirent_points_to_inode_nowarn(struct bch_fs *c,
-                                        struct bkey_s_c_dirent d,
-                                        struct bch_inode_unpacked *inode)
-{
-       if (d.v->d_type == DT_SUBVOL
-           ? le32_to_cpu(d.v->d_child_subvol)  == inode->bi_subvol
-           : le64_to_cpu(d.v->d_inum)          == inode->bi_inum)
-               return 0;
-       return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
-}
-
-static void dirent_inode_mismatch_msg(struct printbuf *out,
-                                     struct bch_fs *c,
-                                     struct bkey_s_c_dirent dirent,
-                                     struct bch_inode_unpacked *inode)
-{
-       prt_str(out, "inode points to dirent that does not point back:");
-       prt_newline(out);
-       bch2_bkey_val_to_text(out, c, dirent.s_c);
-       prt_newline(out);
-       bch2_inode_unpacked_to_text(out, inode);
-}
-
-static int dirent_points_to_inode(struct bch_fs *c,
-                                 struct bkey_s_c_dirent dirent,
-                                 struct bch_inode_unpacked *inode)
-{
-       int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
-       if (ret) {
-               struct printbuf buf = PRINTBUF;
-               dirent_inode_mismatch_msg(&buf, c, dirent, inode);
-               bch_warn(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-       return ret;
-}
-
-/*
- * XXX: this is handling transaction restarts without returning
- * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
- */
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
-                                   u32 snapshot)
-{
-       u64 sectors = 0;
-
-       int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                               SPOS(inum, 0, snapshot),
-                               POS(inum, U64_MAX),
-                               0, k, ({
-               if (bkey_extent_is_allocation(k.k))
-                       sectors += k.k->size;
-               0;
-       }));
-
-       return ret ?: sectors;
-}
-
-static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
-                                   u32 snapshot)
-{
-       u64 subdirs = 0;
-
-       int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
-                                   SPOS(inum, 0, snapshot),
-                                   POS(inum, U64_MAX),
-                                   0, k, ({
-               if (k.k->type == KEY_TYPE_dirent &&
-                   bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
-                       subdirs++;
-               0;
-       }));
-
-       return ret ?: subdirs;
-}
-
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
-                        u32 *snapshot, u64 *inum)
-{
-       struct bch_subvolume s;
-       int ret = bch2_subvolume_get(trans, subvol, false, &s);
-
-       *snapshot = le32_to_cpu(s.snapshot);
-       *inum = le64_to_cpu(s.inode);
-       return ret;
-}
-
-static int lookup_dirent_in_snapshot(struct btree_trans *trans,
-                          struct bch_hash_info hash_info,
-                          subvol_inum dir, struct qstr *name,
-                          u64 *target, unsigned *type, u32 snapshot)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
-                                                        &hash_info, dir, name, 0, snapshot);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       *target = le64_to_cpu(d.v->d_inum);
-       *type = d.v->d_type;
-       bch2_trans_iter_exit(trans, &iter);
-       return 0;
-}
-
-/*
- * Find any subvolume associated with a tree of snapshots
- * We can't rely on master_subvol - it might have been deleted.
- */
-static int find_snapshot_tree_subvol(struct btree_trans *trans,
-                                    u32 tree_id, u32 *subvol)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
-
-               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-               if (le32_to_cpu(s.v->tree) != tree_id)
-                       continue;
-
-               if (s.v->subvol) {
-                       *subvol = le32_to_cpu(s.v->subvol);
-                       goto found;
-               }
-       }
-       ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
-found:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
-                           struct bch_inode_unpacked *lostfound,
-                           u64 reattaching_inum)
-{
-       struct bch_fs *c = trans->c;
-       struct qstr lostfound_str = QSTR("lost+found");
-       struct btree_iter lostfound_iter = {};
-       u64 inum = 0;
-       unsigned d_type = 0;
-       int ret;
-
-       struct bch_snapshot_tree st;
-       ret = bch2_snapshot_tree_lookup(trans,
-                       bch2_snapshot_tree(c, snapshot), &st);
-       if (ret)
-               return ret;
-
-       u32 subvolid;
-       ret = find_snapshot_tree_subvol(trans,
-                               bch2_snapshot_tree(c, snapshot), &subvolid);
-       bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
-                   bch2_snapshot_tree(c, snapshot));
-       if (ret)
-               return ret;
-
-       struct bch_subvolume subvol;
-       ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
-       bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
-       if (ret)
-               return ret;
-
-       if (!subvol.inode) {
-               struct btree_iter iter;
-               struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
-                               BTREE_ID_subvolumes, POS(0, subvolid),
-                               0, subvolume);
-               ret = PTR_ERR_OR_ZERO(subvol);
-               if (ret)
-                       return ret;
-
-               subvol->v.inode = cpu_to_le64(reattaching_inum);
-               bch2_trans_iter_exit(trans, &iter);
-       }
-
-       subvol_inum root_inum = {
-               .subvol = subvolid,
-               .inum = le64_to_cpu(subvol.inode)
-       };
-
-       struct bch_inode_unpacked root_inode;
-       struct bch_hash_info root_hash_info;
-       ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
-       bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
-                   root_inum.inum, subvolid);
-       if (ret)
-               return ret;
-
-       root_hash_info = bch2_hash_info_init(c, &root_inode);
-
-       ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
-                             &lostfound_str, &inum, &d_type, snapshot);
-       if (bch2_err_matches(ret, ENOENT))
-               goto create_lostfound;
-
-       bch_err_fn(c, ret);
-       if (ret)
-               return ret;
-
-       if (d_type != DT_DIR) {
-               bch_err(c, "error looking up lost+found: not a directory");
-               return bch_err_throw(c, ENOENT_not_directory);
-       }
-
-       /*
-        * The bch2_check_dirents pass has already run; dangling dirents
-        * shouldn't exist here:
-        */
-       ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
-       bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
-                   inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
-       return ret;
-
-create_lostfound:
-       /*
-        * we always create lost+found in the root snapshot; we don't want
-        * different branches of the snapshot tree to have different lost+found
-        */
-       snapshot = le32_to_cpu(st.root_snapshot);
-       /*
-        * XXX: we could have a nicer log message here if we had a way to
-        * walk backpointers and print a path
-        */
-       struct printbuf path = PRINTBUF;
-       ret = bch2_inum_to_path(trans, root_inum, &path);
-       if (ret)
-               goto err;
-
-       bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
-                  path.buf, root_inum.subvol, snapshot);
-       printbuf_exit(&path);
-
-       u64 now = bch2_current_time(c);
-       u64 cpu = raw_smp_processor_id();
-
-       bch2_inode_init_early(c, lostfound);
-       bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
-       lostfound->bi_dir = root_inode.bi_inum;
-       lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
-
-       root_inode.bi_nlink++;
-
-       ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
-       if (ret)
-               goto err;
-
-       bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
-       ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
-       if (ret)
-               goto err;
-
-       ret =   bch2_dirent_create_snapshot(trans,
-                               0, root_inode.bi_inum, snapshot, &root_hash_info,
-                               mode_to_type(lostfound->bi_mode),
-                               &lostfound_str,
-                               lostfound->bi_inum,
-                               &lostfound->bi_dir_offset,
-                               BTREE_UPDATE_internal_snapshot_node|
-                               STR_HASH_must_create) ?:
-               bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
-                                      BTREE_UPDATE_internal_snapshot_node);
-err:
-       bch_err_msg(c, ret, "creating lost+found");
-       bch2_trans_iter_exit(trans, &lostfound_iter);
-       return ret;
-}
-
-static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
-{
-       if (inode->bi_inum == BCACHEFS_ROOT_INO &&
-           inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
-               return false;
-
-       /*
-        * Subvolume roots are special: older versions of subvolume roots may be
-        * disconnected; it's only the newest version that matters.
-        *
-        * We only keep a single dirent pointing to a subvolume root, i.e.
-        * older versions of snapshots will not have a different dirent pointing
-        * to the same subvolume root.
-        *
-        * This is because dirents that point to subvolumes are only visible in
-        * the parent subvolume - versioning is not needed - and keeping them
-        * around would break fsck, because when we're crossing subvolumes we
-        * don't have a consistent snapshot ID with which to check the
-        * inode <-> dirent relationships.
-        *
-        * Thus, a subvolume root that's been renamed after a snapshot will have
-        * a disconnected older version - that's expected.
-        *
-        * Note that taking a snapshot always updates the root inode (to update
-        * the dirent backpointer), so a subvolume root inode with
-        * BCH_INODE_has_child_snapshot is never visible.
-        */
-       if (inode->bi_subvol &&
-           (inode->bi_flags & BCH_INODE_has_child_snapshot))
-               return false;
-
-       return !bch2_inode_has_backpointer(inode) &&
-               !(inode->bi_flags & BCH_INODE_unlinked);
-}
-
-static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
-                                       SPOS(d_pos.inode, d_pos.offset, snapshot),
-                                       BTREE_ITER_intent|
-                                       BTREE_ITER_with_updates);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (bpos_eq(k.k->p, d_pos)) {
-               /*
-                * bch2_btree_delete_at() doesn't work because the update path doesn't
-                * internally use BTREE_ITER_with_updates yet
-                */
-               struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
-               ret = PTR_ERR_OR_ZERO(k);
-               if (ret)
-                       goto err;
-
-               bkey_init(&k->k);
-               k->k.type = KEY_TYPE_whiteout;
-               k->k.p = iter.pos;
-               ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_inode_unpacked lostfound;
-       char name_buf[20];
-       int ret;
-
-       u32 dirent_snapshot = inode->bi_snapshot;
-       if (inode->bi_subvol) {
-               inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
-
-               struct btree_iter subvol_iter;
-               struct bkey_i_subvolume *subvol =
-                       bch2_bkey_get_mut_typed(trans, &subvol_iter,
-                                               BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
-                                               0, subvolume);
-               ret = PTR_ERR_OR_ZERO(subvol);
-               if (ret)
-                       return ret;
-
-               subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
-               bch2_trans_iter_exit(trans, &subvol_iter);
-
-               u64 root_inum;
-               ret = subvol_lookup(trans, inode->bi_parent_subvol,
-                                   &dirent_snapshot, &root_inum);
-               if (ret)
-                       return ret;
-
-               snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
-       } else {
-               snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
-       }
-
-       ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
-       if (ret)
-               return ret;
-
-       bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);
-
-       lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
-
-       /* ensure lost+found inode is also present in inode snapshot */
-       if (!inode->bi_subvol) {
-               BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
-               lostfound.bi_snapshot = inode->bi_snapshot;
-       }
-
-       ret = __bch2_fsck_write_inode(trans, &lostfound);
-       if (ret)
-               return ret;
-
-       struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
-       struct qstr name = QSTR(name_buf);
-
-       inode->bi_dir = lostfound.bi_inum;
-
-       ret = bch2_dirent_create_snapshot(trans,
-                               inode->bi_parent_subvol, lostfound.bi_inum,
-                               dirent_snapshot,
-                               &dir_hash,
-                               inode_d_type(inode),
-                               &name,
-                               inode->bi_subvol ?: inode->bi_inum,
-                               &inode->bi_dir_offset,
-                               BTREE_UPDATE_internal_snapshot_node|
-                               STR_HASH_must_create);
-       if (ret) {
-               bch_err_msg(c, ret, "error creating dirent");
-               return ret;
-       }
-
-       ret = __bch2_fsck_write_inode(trans, inode);
-       if (ret)
-               return ret;
-
-       {
-               CLASS(printbuf, buf)();
-               ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
-                                                inode->bi_snapshot, NULL, &buf);
-               if (ret)
-                       return ret;
-
-               bch_info(c, "reattached at %s", buf.buf);
-       }
-
-       /*
-        * Fix up inodes in child snapshots: if they should also be reattached
-        * update the backpointer field, if they should not be we need to emit
-        * whiteouts for the dirent we just created.
-        */
-       if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
-               snapshot_id_list whiteouts_done;
-               struct btree_iter iter;
-               struct bkey_s_c k;
-
-               darray_init(&whiteouts_done);
-
-               for_each_btree_key_reverse_norestart(trans, iter,
-                               BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
-                               BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
-                       if (k.k->p.offset != inode->bi_inum)
-                               break;
-
-                       if (!bkey_is_inode(k.k) ||
-                           !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
-                           snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
-                               continue;
-
-                       struct bch_inode_unpacked child_inode;
-                       ret = bch2_inode_unpack(k, &child_inode);
-                       if (ret)
-                               break;
-
-                       if (!inode_should_reattach(&child_inode)) {
-                               ret = maybe_delete_dirent(trans,
-                                                         SPOS(lostfound.bi_inum, inode->bi_dir_offset,
-                                                              dirent_snapshot),
-                                                         k.k->p.snapshot);
-                               if (ret)
-                                       break;
-
-                               ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
-                               if (ret)
-                                       break;
-                       } else {
-                               iter.snapshot = k.k->p.snapshot;
-                               child_inode.bi_dir = inode->bi_dir;
-                               child_inode.bi_dir_offset = inode->bi_dir_offset;
-
-                               ret = bch2_inode_write_flags(trans, &iter, &child_inode,
-                                                            BTREE_UPDATE_internal_snapshot_node);
-                               if (ret)
-                                       break;
-                       }
-               }
-               darray_exit(&whiteouts_done);
-               bch2_trans_iter_exit(trans, &iter);
-       }
-
-       return ret;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
-                                               struct btree_iter *iter,
-                                               struct bpos pos)
-{
-       return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static int remove_backpointer(struct btree_trans *trans,
-                             struct bch_inode_unpacked *inode)
-{
-       if (!bch2_inode_has_backpointer(inode))
-               return 0;
-
-       u32 snapshot = inode->bi_snapshot;
-
-       if (inode->bi_parent_subvol) {
-               int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
-               if (ret)
-                       return ret;
-       }
-
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
-                                    SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
-       int ret = bkey_err(d) ?:
-                 dirent_points_to_inode(c, d, inode) ?:
-                 bch2_fsck_remove_dirent(trans, d.k->p);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
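
The tail of remove_backpointer() chains three fallible steps with the same `?:` shortcut: each call runs only if everything to its left returned 0, and the first nonzero result becomes ret. Reduced to a standalone toy (the step names are placeholders):

    #include <stdio.h>

    static int step1(void) { return 0; }
    static int step2(void) { return -5; }                       /* fails */
    static int step3(void) { printf("never runs\n"); return 0; }

    int main(void)
    {
            /* evaluated left to right; the first nonzero value short-circuits */
            int ret = step1() ?: step2() ?: step3();
            printf("ret = %d\n", ret);                          /* ret = -5 */
            return 0;
    }
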
-
-static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
-       struct bch_fs *c = trans->c;
-
-       struct bch_inode_unpacked inode;
-       int ret = bch2_inode_find_by_inum_trans(trans,
-                               (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
-                               &inode);
-       if (ret)
-               return ret;
-
-       ret = remove_backpointer(trans, &inode);
-       if (!bch2_err_matches(ret, ENOENT))
-               bch_err_msg(c, ret, "removing dirent");
-       if (ret)
-               return ret;
-
-       ret = reattach_inode(trans, &inode);
-       bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
-       return ret;
-}
-
-static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
-{
-       struct bch_fs *c = trans->c;
-
-       if (!bch2_snapshot_is_leaf(c, snapshotid)) {
-               bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
-               return bch_err_throw(c, fsck_repair_unimplemented);
-       }
-
-       /*
-        * If inum isn't set, that means we're being called from check_dirents,
-        * not check_inodes - the root of this subvolume doesn't exist or we
-        * would have found it there:
-        */
-       if (!inum) {
-               struct btree_iter inode_iter = {};
-               struct bch_inode_unpacked new_inode;
-               u64 cpu = raw_smp_processor_id();
-
-               bch2_inode_init_early(c, &new_inode);
-               bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
-
-               new_inode.bi_subvol = subvolid;
-
-               int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
-                         bch2_btree_iter_traverse(trans, &inode_iter) ?:
-                         bch2_inode_write(trans, &inode_iter, &new_inode);
-               bch2_trans_iter_exit(trans, &inode_iter);
-               if (ret)
-                       return ret;
-
-               inum = new_inode.bi_inum;
-       }
-
-       bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
-
-       struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
-       int ret = PTR_ERR_OR_ZERO(new_subvol);
-       if (ret)
-               return ret;
-
-       bkey_subvolume_init(&new_subvol->k_i);
-       new_subvol->k.p.offset  = subvolid;
-       new_subvol->v.snapshot  = cpu_to_le32(snapshotid);
-       new_subvol->v.inode     = cpu_to_le64(inum);
-       ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
-       if (ret)
-               return ret;
-
-       struct btree_iter iter;
-       struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_snapshots, POS(0, snapshotid),
-                       0, snapshot);
-       ret = PTR_ERR_OR_ZERO(s);
-       bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
-       if (ret)
-               return ret;
-
-       u32 snapshot_tree = le32_to_cpu(s->v.tree);
-
-       s->v.subvol = cpu_to_le32(subvolid);
-       SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
-       bch2_trans_iter_exit(trans, &iter);
-
-       struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
-                       0, snapshot_tree);
-       ret = PTR_ERR_OR_ZERO(st);
-       bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
-       if (ret)
-               return ret;
-
-       if (!st->v.master_subvol)
-               st->v.master_subvol = cpu_to_le32(subvolid);
-
-       bch2_trans_iter_exit(trans, &iter);
-       return 0;
-}
-
-static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
-{
-       struct bch_fs *c = trans->c;
-       unsigned i_mode = S_IFREG;
-       u64 i_size = 0;
-
-       switch (btree) {
-       case BTREE_ID_extents: {
-               struct btree_iter iter = {};
-
-               bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
-               struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
-               bch2_trans_iter_exit(trans, &iter);
-               int ret = bkey_err(k);
-               if (ret)
-                       return ret;
-
-               i_size = k.k->p.offset << 9;
-               break;
-       }
-       case BTREE_ID_dirents:
-               i_mode = S_IFDIR;
-               break;
-       case BTREE_ID_xattrs:
-               break;
-       default:
-               BUG();
-       }
-
-       struct bch_inode_unpacked new_inode;
-       bch2_inode_init_early(c, &new_inode);
-       bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
-       new_inode.bi_size = i_size;
-       new_inode.bi_inum = inum;
-       new_inode.bi_snapshot = snapshot;
-
-       return __bch2_fsck_write_inode(trans, &new_inode);
-}
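
One detail in the extents case above: btree positions count 512-byte sectors, so `<< 9` converts the offset of the last extent into a byte size. A one-line sanity check:

    #include <stdio.h>

    int main(void)
    {
            /* btree positions are in 512-byte sectors; << 9 converts to bytes */
            unsigned long long i_size = 8ULL << 9;
            printf("%llu\n", i_size);   /* 4096 */
            return 0;
    }
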
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
-{
-       darray_exit(&s->ids);
-}
-
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
-       memset(s, 0, sizeof(*s));
-}
-
-static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
-{
-       u32 *i;
-       __darray_for_each(s->ids, i) {
-               if (*i == id)
-                       return 0;
-               if (*i > id)
-                       break;
-       }
-
-       int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
-       if (ret)
-               bch_err(c, "error reallocating snapshots_seen table (size %zu)",
-                       s->ids.size);
-       return ret;
-}
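
snapshots_seen_add_inorder() keeps the ID list sorted and duplicate-free by finding the insertion point before shifting the tail up. The same pattern with a fixed array standing in for the darray (a sketch, not the real darray API):

    #include <stdio.h>
    #include <string.h>

    static int insert_inorder(unsigned *ids, unsigned *nr, unsigned cap, unsigned id)
    {
            unsigned i;

            for (i = 0; i < *nr; i++) {
                    if (ids[i] == id)
                            return 0;       /* already present */
                    if (ids[i] > id)
                            break;          /* insertion point found */
            }

            if (*nr == cap)
                    return -1;              /* stand-in for a realloc failure */

            memmove(&ids[i + 1], &ids[i], (*nr - i) * sizeof(*ids));
            ids[i] = id;
            (*nr)++;
            return 0;
    }

    int main(void)
    {
            unsigned ids[8], nr = 0;

            insert_inorder(ids, &nr, 8, 7);
            insert_inorder(ids, &nr, 8, 2);
            insert_inorder(ids, &nr, 8, 7);         /* duplicate, ignored */
            insert_inorder(ids, &nr, 8, 10);

            for (unsigned i = 0; i < nr; i++)
                    printf("%u ", ids[i]);          /* 2 7 10 */
            printf("\n");
            return 0;
    }
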
-
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
-                                enum btree_id btree_id, struct bpos pos)
-{
-       if (!bkey_eq(s->pos, pos))
-               s->ids.nr = 0;
-       s->pos = pos;
-
-       return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
-}
-
-/**
- * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
- * and @ancestor hasn't been overwritten in @seen
- *
- * @c:         filesystem handle
- * @seen:      list of snapshot ids already seen at current position
- * @id:                descendant snapshot id
- * @ancestor:  ancestor snapshot id
- *
- * Returns:    whether key in @ancestor snapshot is visible in @id snapshot
- */
-static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
-                                   u32 id, u32 ancestor)
-{
-       EBUG_ON(id > ancestor);
-
-       if (id == ancestor)
-               return true;
-
-       if (!bch2_snapshot_is_ancestor(c, id, ancestor))
-               return false;
-
-       /*
-        * We know that @id is a descendant of @ancestor; we're checking if
-        * we've seen a key that overwrote @ancestor - i.e. also a descendant of
-        * @ancestor and with @id as a descendant.
-        *
-        * But we already know that we're scanning IDs between @id and @ancestor
-        * numerically, since snapshot ID lists are kept sorted, so if we find
-        * an id that's an ancestor of @id we're done:
-        */
-       darray_for_each_reverse(seen->ids, i)
-               if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
-                       return false;
-
-       return true;
-}
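
A self-contained model of the rule above: snapshot IDs follow the code's convention that a child's ID is numerically smaller than its ancestors'. Here parent[], is_ancestor() and the seen[] array are toy stand-ins for bch2_snapshot_is_ancestor() and struct snapshots_seen, purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* toy snapshot tree: 10 is the root; 7 and 4 are children of 10; 2 is a child of 7 */
    static const unsigned parent[11] = { [7] = 10, [4] = 10, [2] = 7 };

    static bool is_ancestor(unsigned id, unsigned ancestor)
    {
            while (id && id != ancestor)
                    id = parent[id];
            return id == ancestor;
    }

    /* seen[]: snapshot IDs already holding a key at this position, sorted ascending */
    static bool key_visible(const unsigned *seen, unsigned nr,
                            unsigned id, unsigned ancestor)
    {
            if (id == ancestor)
                    return true;
            if (!is_ancestor(id, ancestor))
                    return false;

            /* a seen key in a snapshot between @id and @ancestor overwrites
             * the @ancestor version on @id's branch */
            for (unsigned i = nr; i--;)
                    if (seen[i] != ancestor && is_ancestor(id, seen[i]))
                            return false;
            return true;
    }

    int main(void)
    {
            unsigned seen1[] = { 10 };      /* only the root version exists */
            printf("%d\n", key_visible(seen1, 1, 2, 10));   /* 1: 2 inherits it */

            unsigned seen2[] = { 7, 10 };   /* snapshot 7 wrote its own version */
            printf("%d\n", key_visible(seen2, 2, 2, 10));   /* 0: 7 overwrote 10 */
            printf("%d\n", key_visible(seen2, 2, 4, 10));   /* 1: 7 is another branch */
            return 0;
    }

The second call returns false because the version written in snapshot 7 masks the snapshot-10 version for everything on 7's branch; the sibling snapshot 4 still sees the original.
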
-
-/**
- * ref_visible - given a key with snapshot id @src that points to a key with
- * snapshot id @dst, test whether there is some snapshot in which @dst is
- * visible.
- *
- * @c:         filesystem handle
- * @s:         list of snapshot IDs already seen at @src
- * @src:       snapshot ID of src key
- * @dst:       snapshot ID of dst key
- * Returns:    true if there is some snapshot in which @dst is visible
- *
- * Assumes we're visiting @src keys in natural key order
- */
-static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
-                       u32 src, u32 dst)
-{
-       return dst <= src
-               ? key_visible_in_snapshot(c, s, dst, src)
-               : bch2_snapshot_is_ancestor(c, src, dst);
-}
-
-static int ref_visible2(struct bch_fs *c,
-                       u32 src, struct snapshots_seen *src_seen,
-                       u32 dst, struct snapshots_seen *dst_seen)
-{
-       if (dst > src) {
-               swap(dst, src);
-               swap(dst_seen, src_seen);
-       }
-       return key_visible_in_snapshot(c, src_seen, dst, src);
-}
-
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                              \
-       for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
-            (_i)->inode.bi_snapshot <= (_snapshot); _i++)                              \
-               if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
-
-struct inode_walker_entry {
-       struct bch_inode_unpacked inode;
-       bool                    whiteout;
-       u64                     count;
-       u64                     i_size;
-};
-
-struct inode_walker {
-       bool                            first_this_inode;
-       bool                            have_inodes;
-       bool                            recalculate_sums;
-       struct bpos                     last_pos;
-
-       DARRAY(struct inode_walker_entry) inodes;
-       snapshot_id_list                deletes;
-};
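
For orientation, the shape of a typical for_each_visible_inode() call site; this is an illustrative fragment rather than a verbatim excerpt, and it assumes c, s, w and the current key k are in scope:

    struct inode_walker_entry *i;

    /* credit this extent to every inode version that can see it */
    for_each_visible_inode(c, s, w, k.k->p.snapshot, i)
            i->count += k.k->size;
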
-
-static void inode_walker_exit(struct inode_walker *w)
-{
-       darray_exit(&w->inodes);
-       darray_exit(&w->deletes);
-}
-
-static struct inode_walker inode_walker_init(void)
-{
-       return (struct inode_walker) { 0, };
-}
-
-static int add_inode(struct bch_fs *c, struct inode_walker *w,
-                    struct bkey_s_c inode)
-{
-       int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
-               .whiteout       = !bkey_is_inode(inode.k),
-       }));
-       if (ret)
-               return ret;
-
-       struct inode_walker_entry *n = &darray_last(w->inodes);
-       if (!n->whiteout) {
-               return bch2_inode_unpack(inode, &n->inode);
-       } else {
-               n->inode.bi_inum        = inode.k->p.offset;
-               n->inode.bi_snapshot    = inode.k->p.snapshot;
-               return 0;
-       }
-}
-
-static int get_inodes_all_snapshots(struct btree_trans *trans,
-                                   struct inode_walker *w, u64 inum)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       /*
-        * We no longer have inodes for w->last_pos; clear this to avoid
-        * screwing up check_i_sectors/check_subdir_count if we take a
-        * transaction restart here:
-        */
-       w->have_inodes = false;
-       w->recalculate_sums = false;
-       w->inodes.nr = 0;
-
-       for_each_btree_key_max_norestart(trans, iter,
-                       BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
-                       BTREE_ITER_all_snapshots, k, ret) {
-               ret = add_inode(c, w, k);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               return ret;
-
-       w->first_this_inode = true;
-       w->have_inodes = true;
-       return 0;
-}
-
-static int get_visible_inodes(struct btree_trans *trans,
-                             struct inode_walker *w,
-                             struct snapshots_seen *s,
-                             u64 inum)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       w->inodes.nr = 0;
-       w->deletes.nr = 0;
-
-       for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
-                          BTREE_ITER_all_snapshots, k, ret) {
-               if (k.k->p.offset != inum)
-                       break;
-
-               if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
-                       continue;
-
-               if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
-                       continue;
-
-               ret = bkey_is_inode(k.k)
-                       ? add_inode(c, w, k)
-                       : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-
-       struct inode_walker_entry *i = darray_find_p(w->inodes, i,
-                   bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
-
-       if (!i)
-               return NULL;
-
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
-                       trans, snapshot_key_missing_inode_snapshot,
-                        "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
-                        "unexpected because we should always update the inode when we update a key in that inode\n"
-                        "%s",
-                        w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
-                        (bch2_bkey_val_to_text(&buf, c, k),
-                         buf.buf))) {
-               if (!i->whiteout) {
-                       struct bch_inode_unpacked new = i->inode;
-                       new.bi_snapshot = k.k->p.snapshot;
-                       ret = __bch2_fsck_write_inode(trans, &new);
-               } else {
-                       struct bkey_i whiteout;
-                       bkey_init(&whiteout.k);
-                       whiteout.k.type = KEY_TYPE_whiteout;
-                       whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
-                       ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-                                                         &whiteout,
-                                                         BTREE_UPDATE_internal_snapshot_node);
-               }
-
-               if (ret)
-                       goto fsck_err;
-
-               ret = bch2_trans_commit(trans, NULL, NULL, 0);
-               if (ret)
-                       goto fsck_err;
-
-               struct inode_walker_entry new_entry = *i;
-
-               new_entry.inode.bi_snapshot     = k.k->p.snapshot;
-               new_entry.count                 = 0;
-               new_entry.i_size                = 0;
-
-               while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
-                       --i;
-
-               size_t pos = i - w->inodes.data;
-               ret = darray_insert_item(&w->inodes, pos, new_entry);
-               if (ret)
-                       goto fsck_err;
-
-               ret = bch_err_throw(c, transaction_restart_nested);
-               goto fsck_err;
-       }
-
-       printbuf_exit(&buf);
-       return i;
-fsck_err:
-       printbuf_exit(&buf);
-       return ERR_PTR(ret);
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
-                                            struct inode_walker *w,
-                                            struct bkey_s_c k)
-{
-       if (w->last_pos.inode != k.k->p.inode) {
-               int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-
-       w->last_pos = k.k->p;
-
-       return lookup_inode_for_snapshot(trans, w, k);
-}
-
-/*
- * Prefer to delete the first one, since that will be the one at the wrong
- * offset:
- * return value: 0 -> delete k1, 1 -> delete k2
- */
-int bch2_fsck_update_backpointers(struct btree_trans *trans,
-                                 struct snapshots_seen *s,
-                                 const struct bch_hash_desc desc,
-                                 struct bch_hash_info *hash_info,
-                                 struct bkey_i *new)
-{
-       if (new->k.type != KEY_TYPE_dirent)
-               return 0;
-
-       struct bkey_i_dirent *d = bkey_i_to_dirent(new);
-       struct inode_walker target = inode_walker_init();
-       int ret = 0;
-
-       if (d->v.d_type == DT_SUBVOL) {
-               bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
-               ret = -BCH_ERR_fsck_repair_unimplemented;
-       } else {
-               ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
-               if (ret)
-                       goto err;
-
-               darray_for_each(target.inodes, i) {
-                       i->inode.bi_dir_offset = d->k.p.offset;
-                       ret = __bch2_fsck_write_inode(trans, &i->inode);
-                       if (ret)
-                               goto err;
-               }
-       }
-err:
-       inode_walker_exit(&target);
-       return ret;
-}
-
-static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
-                                              struct btree_iter *iter,
-                                              struct bch_inode_unpacked *inode,
-                                              u32 *snapshot)
-{
-       if (inode->bi_subvol) {
-               u64 inum;
-               int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
-               if (ret)
-                       return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
-       }
-
-       return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
-}
-
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
-       int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int check_inode_dirent_inode(struct btree_trans *trans,
-                                   struct bch_inode_unpacked *inode,
-                                   bool *write_inode)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-
-       u32 inode_snapshot = inode->bi_snapshot;
-       struct btree_iter dirent_iter = {};
-       struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
-       int ret = bkey_err(d);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
-           inode->bi_subvol &&
-           (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
-               /* Older version of a renamed subvolume root: we won't have a
-                * correct dirent for it. That's expected, see
-                * inode_should_reattach().
-                *
-                * We don't clear the backpointer field when doing the rename
-                * because there might be arbitrarily many versions in older
-                * snapshots.
-                */
-               inode->bi_dir = 0;
-               inode->bi_dir_offset = 0;
-               *write_inode = true;
-               goto out;
-       }
-
-       if (fsck_err_on(ret,
-                       trans, inode_points_to_missing_dirent,
-                       "inode points to missing dirent\n%s",
-                       (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
-           fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
-                       trans, inode_points_to_wrong_dirent,
-                       "%s",
-                       (printbuf_reset(&buf),
-                        dirent_inode_mismatch_msg(&buf, c, d, inode),
-                        buf.buf))) {
-               /*
-                * We just clear the backpointer fields for now. If we find a
-                * dirent that points to this inode in check_dirents(), we'll
-                * update it then; then when we get to check_path() if the
-                * backpointer is still 0 we'll reattach it.
-                */
-               inode->bi_dir = 0;
-               inode->bi_dir_offset = 0;
-               *write_inode = true;
-       }
-out:
-       ret = 0;
-fsck_err:
-       bch2_trans_iter_exit(trans, &dirent_iter);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_inode(struct btree_trans *trans,
-                      struct btree_iter *iter,
-                      struct bkey_s_c k,
-                      struct bch_inode_unpacked *snapshot_root,
-                      struct snapshots_seen *s)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct bch_inode_unpacked u;
-       bool do_update = false;
-       int ret;
-
-       ret = bch2_check_key_has_snapshot(trans, iter, k);
-       if (ret < 0)
-               goto err;
-       if (ret)
-               return 0;
-
-       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-       if (ret)
-               goto err;
-
-       if (!bkey_is_inode(k.k))
-               return 0;
-
-       ret = bch2_inode_unpack(k, &u);
-       if (ret)
-               goto err;
-
-       if (snapshot_root->bi_inum != u.bi_inum) {
-               ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
-               if (ret)
-                       goto err;
-       }
-
-       if (u.bi_hash_seed      != snapshot_root->bi_hash_seed ||
-           INODE_STR_HASH(&u)  != INODE_STR_HASH(snapshot_root)) {
-               ret = bch2_repair_inode_hash_info(trans, snapshot_root);
-               BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
-       if (ret)
-               goto err;
-
-       if (bch2_inode_has_backpointer(&u)) {
-               ret = check_inode_dirent_inode(trans, &u, &do_update);
-               if (ret)
-                       goto err;
-       }
-
-       if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
-                       (u.bi_flags & BCH_INODE_unlinked),
-                       trans, inode_unlinked_but_has_dirent,
-                       "inode unlinked but has dirent\n%s",
-                       (printbuf_reset(&buf),
-                        bch2_inode_unpacked_to_text(&buf, &u),
-                        buf.buf))) {
-               u.bi_flags &= ~BCH_INODE_unlinked;
-               do_update = true;
-       }
-
-       if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
-               /* Check for this early so that check_unreachable_inode() will reattach it */
-
-               ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
-               if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
-                       goto err;
-
-               fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
-                           "dir unlinked but not empty\n%s",
-                           (printbuf_reset(&buf),
-                            bch2_inode_unpacked_to_text(&buf, &u),
-                            buf.buf));
-               u.bi_flags &= ~BCH_INODE_unlinked;
-               do_update = true;
-               ret = 0;
-       }
-
-       if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
-                       trans, inode_dir_has_nonzero_i_size,
-                       "directory %llu:%u with nonzero i_size %lli",
-                       u.bi_inum, u.bi_snapshot, u.bi_size)) {
-               u.bi_size = 0;
-               do_update = true;
-       }
-
-       ret = bch2_inode_has_child_snapshots(trans, k.k->p);
-       if (ret < 0)
-               goto err;
-
-       if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
-                       trans, inode_has_child_snapshots_wrong,
-                       "inode has_child_snapshots flag wrong (should be %u)\n%s",
-                       ret,
-                       (printbuf_reset(&buf),
-                        bch2_inode_unpacked_to_text(&buf, &u),
-                        buf.buf))) {
-               if (ret)
-                       u.bi_flags |= BCH_INODE_has_child_snapshot;
-               else
-                       u.bi_flags &= ~BCH_INODE_has_child_snapshot;
-               do_update = true;
-       }
-       ret = 0;
-
-       if ((u.bi_flags & BCH_INODE_unlinked) &&
-           !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
-               if (!test_bit(BCH_FS_started, &c->flags)) {
-                       /*
-                        * If we're not in online fsck, don't delete unlinked
-                        * inodes, just make sure they're on the deleted list.
-                        *
-                        * They might be referred to by a logged operation -
-                        * i.e. we might have crashed in the middle of a
-                        * truncate on an unlinked but open file - so we want to
-                        * let the delete_dead_inodes kill it after resuming
-                        * logged ops.
-                        */
-                       ret = check_inode_deleted_list(trans, k.k->p);
-                       if (ret < 0)
-                               goto err_noprint;
-
-                       fsck_err_on(!ret,
-                                   trans, unlinked_inode_not_on_deleted_list,
-                                   "inode %llu:%u unlinked, but not on deleted list",
-                                   u.bi_inum, k.k->p.snapshot);
-
-                       ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
-                       if (ret)
-                               goto err;
-               } else {
-                       ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
-                       if (ret < 0)
-                               goto err;
-
-                       if (fsck_err_on(!ret,
-                                       trans, inode_unlinked_and_not_open,
-                                     "inode %llu:%u unlinked and not open",
-                                     u.bi_inum, u.bi_snapshot)) {
-                               ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
-                               bch_err_msg(c, ret, "in fsck deleting inode");
-                               goto err_noprint;
-                       }
-                       ret = 0;
-               }
-       }
-
-       if (fsck_err_on(u.bi_parent_subvol &&
-                       (u.bi_subvol == 0 ||
-                        u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
-                       trans, inode_bi_parent_nonzero,
-                       "inode %llu:%u has subvol %u but nonzero parent subvol %u",
-                       u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
-               u.bi_parent_subvol = 0;
-               do_update = true;
-       }
-
-       if (u.bi_subvol) {
-               struct bch_subvolume s;
-
-               ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
-               if (ret && !bch2_err_matches(ret, ENOENT))
-                       goto err;
-
-               if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
-                       ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
-                       goto do_update;
-               }
-
-               if (fsck_err_on(ret,
-                               trans, inode_bi_subvol_missing,
-                               "inode %llu:%u bi_subvol points to missing subvolume %u",
-                               u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
-                   fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
-                               !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
-                                                          k.k->p.snapshot),
-                               trans, inode_bi_subvol_wrong,
-                               "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
-                               u.bi_inum, k.k->p.snapshot, u.bi_subvol,
-                               le64_to_cpu(s.inode),
-                               le32_to_cpu(s.snapshot))) {
-                       u.bi_subvol = 0;
-                       u.bi_parent_subvol = 0;
-                       do_update = true;
-               }
-       }
-
-       if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
-                       trans, inode_journal_seq_in_future,
-                       "inode journal seq in future (currently at %llu)\n%s",
-                       journal_cur_seq(&c->journal),
-                       (printbuf_reset(&buf),
-                        bch2_inode_unpacked_to_text(&buf, &u),
-                       buf.buf))) {
-               u.bi_journal_seq = journal_cur_seq(&c->journal);
-               do_update = true;
-       }
-do_update:
-       if (do_update) {
-               ret = __bch2_fsck_write_inode(trans, &u);
-               bch_err_msg(c, ret, "in fsck updating inode");
-               if (ret)
-                       goto err_noprint;
-       }
-err:
-fsck_err:
-       bch_err_fn(c, ret);
-err_noprint:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_inodes(struct bch_fs *c)
-{
-       struct bch_inode_unpacked snapshot_root = {};
-       struct snapshots_seen s;
-
-       snapshots_seen_init(&s);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
-                               POS_MIN,
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_inode(trans, &iter, k, &snapshot_root, &s)));
-
-       snapshots_seen_exit(&s);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
-                                           struct bch_inode_unpacked *inode)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       /*
-        * We look for inodes to reattach in natural key order, leaves first,
-        * but we should do the reattach at the oldest version that needs to be
-        * reattached:
-        */
-       for_each_btree_key_norestart(trans, iter,
-                                    BTREE_ID_inodes,
-                                    SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
-                                    BTREE_ITER_all_snapshots, k, ret) {
-               if (k.k->p.offset != inode->bi_inum)
-                       break;
-
-               if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
-                       continue;
-
-               if (!bkey_is_inode(k.k))
-                       break;
-
-               struct bch_inode_unpacked parent_inode;
-               ret = bch2_inode_unpack(k, &parent_inode);
-               if (ret)
-                       break;
-
-               if (!inode_should_reattach(&parent_inode))
-                       break;
-
-               *inode = parent_inode;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static int check_unreachable_inode(struct btree_trans *trans,
-                                  struct btree_iter *iter,
-                                  struct bkey_s_c k)
-{
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       if (!bkey_is_inode(k.k))
-               return 0;
-
-       struct bch_inode_unpacked inode;
-       ret = bch2_inode_unpack(k, &inode);
-       if (ret)
-               return ret;
-
-       if (!inode_should_reattach(&inode))
-               return 0;
-
-       ret = find_oldest_inode_needs_reattach(trans, &inode);
-       if (ret)
-               return ret;
-
-       if (fsck_err(trans, inode_unreachable,
-                    "unreachable inode:\n%s",
-                    (bch2_inode_unpacked_to_text(&buf, &inode),
-                     buf.buf)))
-               ret = reattach_inode(trans, &inode);
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * Reattach unreachable (but not unlinked) inodes
- *
- * Run after check_inodes() and check_dirents(), so we know that inode
- * backpointer fields point to valid dirents, and every inode that has a dirent
- * that points to it has its backpointer field set - so we're just looking for
- * non-unlinked inodes without backpointers:
- *
- * XXX: this is racy w.r.t. hardlink removal in online fsck
- */
-int bch2_check_unreachable_inodes(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
-                               POS_MIN,
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_unreachable_inode(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
-{
-       switch (btree) {
-       case BTREE_ID_extents:
-               return S_ISREG(mode) || S_ISLNK(mode);
-       case BTREE_ID_dirents:
-               return S_ISDIR(mode);
-       case BTREE_ID_xattrs:
-               return true;
-       default:
-               BUG();
-       }
-}
-
-static int check_key_has_inode(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct inode_walker *inode,
-                              struct inode_walker_entry *i,
-                              struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct btree_iter iter2 = {};
-       int ret = PTR_ERR_OR_ZERO(i);
-       if (ret)
-               return ret;
-
-       if (k.k->type == KEY_TYPE_whiteout)
-               goto out;
-
-       bool have_inode = i && !i->whiteout;
-
-       if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
-               goto reconstruct;
-
-       if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
-               goto out;
-
-       prt_printf(&buf, ", ");
-
-       bool have_old_inode = false;
-       darray_for_each(inode->inodes, i2)
-               if (!i2->whiteout &&
-                   bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
-                   btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
-                       prt_printf(&buf, "but found good inode in older snapshot\n");
-                       bch2_inode_unpacked_to_text(&buf, &i2->inode);
-                       prt_newline(&buf);
-                       have_old_inode = true;
-                       break;
-               }
-
-       struct bkey_s_c k2;
-       unsigned nr_keys = 0;
-
-       prt_printf(&buf, "found keys:\n");
-
-       for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
-                                        SPOS(k.k->p.inode, 0, k.k->p.snapshot),
-                                        POS(k.k->p.inode, U64_MAX),
-                                        0, k2, ret) {
-               nr_keys++;
-               if (nr_keys <= 10) {
-                       bch2_bkey_val_to_text(&buf, c, k2);
-                       prt_newline(&buf);
-               }
-               if (nr_keys >= 100)
-                       break;
-       }
-
-       if (ret)
-               goto err;
-
-       if (nr_keys >= 100)
-               prt_printf(&buf, "found at least %u keys for this missing inode\n", nr_keys);
-       else if (nr_keys > 10)
-               prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);
-
-       if (!have_inode) {
-               if (fsck_err_on(!have_inode,
-                               trans, key_in_missing_inode,
-                               "key in missing inode%s", buf.buf)) {
-                       /*
-                        * Maybe a deletion that raced with data move, or something
-                        * weird like that? But if we know the inode was deleted, or
-                        * it's just a few keys, we can safely delete them.
-                        *
-                        * If it's many keys, we should probably recreate the inode
-                        */
-                       if (have_old_inode || nr_keys <= 2)
-                               goto delete;
-                       else
-                               goto reconstruct;
-               }
-       } else {
-               /*
-                * not autofix, this one would be a giant wtf - bit error in the
-                * inode corrupting i_mode?
-                *
-                * may want to try repairing inode instead of deleting
-                */
-               if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
-                               trans, key_in_wrong_inode_type,
-                               "key for wrong inode mode %o%s",
-                               i->inode.bi_mode, buf.buf))
-                       goto delete;
-       }
-out:
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &iter2);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-delete:
-       /*
-        * XXX: print out more info
-        * count up extents for this inode, check if we have different inode in
-        * an older snapshot version, perhaps decide if we want to reconstitute
-        */
-       ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
-       goto out;
-reconstruct:
-       ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
-               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-       if (ret)
-               goto err;
-
-       inode->last_pos.inode--;
-       ret = bch_err_throw(c, transaction_restart_nested);
-       goto out;
-}
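-
-/*
- * Summary of the repair policy above: inode missing and the inodes
- * btree was lost -> reconstruct; inode missing but only a couple of
- * keys, or a good inode exists in an older snapshot -> delete the
- * strays; inode missing with many keys -> reconstruct; inode present
- * but i_mode doesn't match this btree -> delete (deliberately not
- * autofix). Reconstruction rewinds inode->last_pos and returns
- * transaction_restart_nested so the walker picks up the new inode.
- */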
-
-static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-       s64 count2;
-
-       darray_for_each(w->inodes, i) {
-               if (i->inode.bi_sectors == i->count)
-                       continue;
-
-               count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
-
-               if (w->recalculate_sums)
-                       i->count = count2;
-
-               if (i->count != count2) {
-                       bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
-                                           w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
-                       i->count = count2;
-               }
-
-               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
-                               trans, inode_i_sectors_wrong,
-                               "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
-                               w->last_pos.inode, i->inode.bi_snapshot,
-                               i->inode.bi_sectors, i->count)) {
-                       i->inode.bi_sectors = i->count;
-                       ret = bch2_fsck_write_inode(trans, &i->inode);
-                       if (ret)
-                               break;
-               }
-       }
-fsck_err:
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
-{
-       u32 restart_count = trans->restart_count;
-       return check_i_sectors_notnested(trans, w) ?:
-               trans_was_restarted(trans, restart_count);
-}
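-
-/*
- * The _notnested/wrapper split recurs in this file (see also
- * check_subdir_dirents_count()): the wrapper samples
- * trans->restart_count first so trans_was_restarted() can turn any
- * restart inside the helper into transaction_restart_nested, which the
- * enclosing btree walk knows how to handle.
- */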
-
-struct extent_end {
-       u32                     snapshot;
-       u64                     offset;
-       struct snapshots_seen   seen;
-};
-
-struct extent_ends {
-       struct bpos                     last_pos;
-       DARRAY(struct extent_end)       e;
-};
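-
-/*
- * For the inode currently being walked, extent_ends records the end
- * offset of the last extent seen in each snapshot, kept sorted by
- * snapshot id (see extent_ends_at()); check_overlapping_extents()
- * compares each new extent's start offset against these entries to
- * catch overlaps between snapshot versions.
- */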
-
-static void extent_ends_reset(struct extent_ends *extent_ends)
-{
-       darray_for_each(extent_ends->e, i)
-               snapshots_seen_exit(&i->seen);
-       extent_ends->e.nr = 0;
-}
-
-static void extent_ends_exit(struct extent_ends *extent_ends)
-{
-       extent_ends_reset(extent_ends);
-       darray_exit(&extent_ends->e);
-}
-
-static void extent_ends_init(struct extent_ends *extent_ends)
-{
-       memset(extent_ends, 0, sizeof(*extent_ends));
-}
-
-static int extent_ends_at(struct bch_fs *c,
-                         struct extent_ends *extent_ends,
-                         struct snapshots_seen *seen,
-                         struct bkey_s_c k)
-{
-       struct extent_end *i, n = (struct extent_end) {
-               .offset         = k.k->p.offset,
-               .snapshot       = k.k->p.snapshot,
-               .seen           = *seen,
-       };
-
-       n.seen.ids.data = kmemdup(seen->ids.data,
-                             sizeof(seen->ids.data[0]) * seen->ids.size,
-                             GFP_KERNEL);
-       if (!n.seen.ids.data)
-               return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
-
-       __darray_for_each(extent_ends->e, i) {
-               if (i->snapshot == k.k->p.snapshot) {
-                       snapshots_seen_exit(&i->seen);
-                       *i = n;
-                       return 0;
-               }
-
-               if (i->snapshot >= k.k->p.snapshot)
-                       break;
-       }
-
-       return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
-}
-
-static int overlapping_extents_found(struct btree_trans *trans,
-                                    enum btree_id btree,
-                                    struct bpos pos1, struct snapshots_seen *pos1_seen,
-                                    struct bkey pos2,
-                                    bool *fixed,
-                                    struct extent_end *extent_end)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct btree_iter iter1, iter2 = {};
-       struct bkey_s_c k1, k2;
-       int ret;
-
-       BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
-
-       bch2_trans_iter_init(trans, &iter1, btree, pos1,
-                            BTREE_ITER_all_snapshots|
-                            BTREE_ITER_not_extents);
-       k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
-       ret = bkey_err(k1);
-       if (ret)
-               goto err;
-
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, k1);
-
-       if (!bpos_eq(pos1, k1.k->p)) {
-               prt_str(&buf, "\nwanted\n  ");
-               bch2_bpos_to_text(&buf, pos1);
-               prt_str(&buf, "\n");
-               bch2_bkey_to_text(&buf, &pos2);
-
-               bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
-                       __func__, buf.buf);
-               ret = bch_err_throw(c, internal_fsck_err);
-               goto err;
-       }
-
-       bch2_trans_copy_iter(trans, &iter2, &iter1);
-
-       while (1) {
-               bch2_btree_iter_advance(trans, &iter2);
-
-               k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
-               ret = bkey_err(k2);
-               if (ret)
-                       goto err;
-
-               if (bpos_ge(k2.k->p, pos2.p))
-                       break;
-       }
-
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, k2);
-
-       if (bpos_gt(k2.k->p, pos2.p) ||
-           pos2.size != k2.k->size) {
-               bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
-                       __func__, buf.buf);
-               ret = bch_err_throw(c, internal_fsck_err);
-               goto err;
-       }
-
-       prt_printf(&buf, "\noverwriting %s extent",
-                  pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
-
-       if (fsck_err(trans, extent_overlapping,
-                    "overlapping extents%s", buf.buf)) {
-               struct btree_iter *old_iter = &iter1;
-               struct disk_reservation res = { 0 };
-
-               if (pos1.snapshot < pos2.p.snapshot) {
-                       old_iter = &iter2;
-                       swap(k1, k2);
-               }
-
-               trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
-
-               ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
-                               BTREE_UPDATE_internal_snapshot_node,
-                               k1, k2) ?:
-                       bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
-               bch2_disk_reservation_put(c, &res);
-
-               bch_info(c, "repair ret %s", bch2_err_str(ret));
-
-               if (ret)
-                       goto err;
-
-               *fixed = true;
-
-               if (pos1.snapshot == pos2.p.snapshot) {
-                       /*
-                        * We overwrote the first extent, and did the overwrite
-                        * in the same snapshot:
-                        */
-                       extent_end->offset = bkey_start_offset(&pos2);
-               } else if (pos1.snapshot > pos2.p.snapshot) {
-                       /*
-                        * We overwrote the first extent in pos2's snapshot:
-                        */
-                       ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
-               } else {
-                       /*
-                        * We overwrote the second extent - restart
-                        * check_extent() from the top:
-                        */
-                       ret = bch_err_throw(c, transaction_restart_nested);
-               }
-       }
-fsck_err:
-err:
-       bch2_trans_iter_exit(trans, &iter2);
-       bch2_trans_iter_exit(trans, &iter1);
-       printbuf_exit(&buf);
-       return ret;
-}
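-
-/*
- * Repair note: the extent whose position has the greater snapshot id
- * is the one overwritten (ties go to the first). The three cases above
- * then patch up the caller's state: same snapshot adjusts
- * extent_end->offset; overwriting the first extent in pos2's snapshot
- * records that snapshot in pos1's seen list; overwriting the second
- * extent restarts check_extent() via transaction_restart_nested.
- */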
-
-static int check_overlapping_extents(struct btree_trans *trans,
-                             struct snapshots_seen *seen,
-                             struct extent_ends *extent_ends,
-                             struct bkey_s_c k,
-                             struct btree_iter *iter,
-                             bool *fixed)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       /* transaction restart, running again */
-       if (bpos_eq(extent_ends->last_pos, k.k->p))
-               return 0;
-
-       if (extent_ends->last_pos.inode != k.k->p.inode)
-               extent_ends_reset(extent_ends);
-
-       darray_for_each(extent_ends->e, i) {
-               if (i->offset <= bkey_start_offset(k.k))
-                       continue;
-
-               if (!ref_visible2(c,
-                                 k.k->p.snapshot, seen,
-                                 i->snapshot, &i->seen))
-                       continue;
-
-               ret = overlapping_extents_found(trans, iter->btree_id,
-                                               SPOS(iter->pos.inode,
-                                                    i->offset,
-                                                    i->snapshot),
-                                               &i->seen,
-                                               *k.k, fixed, i);
-               if (ret)
-                       goto err;
-       }
-
-       extent_ends->last_pos = k.k->p;
-err:
-       return ret;
-}
-
-static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
-                               struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
-       unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
-
-       bkey_for_each_crc(k.k, ptrs, crc, i)
-               if (crc_is_encoded(crc) &&
-                   crc.uncompressed_size > encoded_extent_max_sectors) {
-                       struct printbuf buf = PRINTBUF;
-
-                       bch2_bkey_val_to_text(&buf, c, k);
-                       bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-       return 0;
-}
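-
-/*
- * c->opts.encoded_extent_max is in bytes; the >> 9 converts it to
- * 512-byte sectors to match crc.uncompressed_size. This pass only
- * reports oversized encoded extents, there is no repair path here.
- */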
-
-static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
-                       struct bkey_s_c k,
-                       struct inode_walker *inode,
-                       struct snapshots_seen *s,
-                       struct extent_ends *extent_ends,
-                       struct disk_reservation *res)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       ret = bch2_check_key_has_snapshot(trans, iter, k);
-       if (ret) {
-               ret = ret < 0 ? ret : 0;
-               goto out;
-       }
-
-       if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
-               ret = check_i_sectors(trans, inode);
-               if (ret)
-                       goto err;
-       }
-
-       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-       if (ret)
-               goto err;
-
-       struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
-       ret = PTR_ERR_OR_ZERO(extent_i);
-       if (ret)
-               goto err;
-
-       ret = check_key_has_inode(trans, iter, inode, extent_i, k);
-       if (ret)
-               goto err;
-
-       if (k.k->type != KEY_TYPE_whiteout) {
-               ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
-                                               &inode->recalculate_sums);
-               if (ret)
-                       goto err;
-
-               /*
-                * Check inodes in reverse order, from oldest snapshots to
-                * newest, starting from the inode that matches this extent's
-                * snapshot. If we didn't have one, iterate over all inodes:
-                */
-               for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
-                    inode->inodes.data && i >= inode->inodes.data;
-                    --i) {
-                       if (i->inode.bi_snapshot > k.k->p.snapshot ||
-                           !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
-                               continue;
-
-                       u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;
-
-                       if (fsck_err_on(k.k->p.offset > last_block &&
-                                       !bkey_extent_is_reservation(k),
-                                       trans, extent_past_end_of_inode,
-                                       "extent type past end of inode %llu:%u, i_size %llu\n%s",
-                                       i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
-                                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                               ret =   snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?:
-                                       bch2_fpunch_snapshot(trans,
-                                                            SPOS(i->inode.bi_inum,
-                                                                 last_block,
-                                                                 i->inode.bi_snapshot),
-                                                            POS(i->inode.bi_inum, U64_MAX));
-                               if (ret)
-                                       goto err;
-
-                               iter->k.type = KEY_TYPE_whiteout;
-                               break;
-                       }
-               }
-       }
-
-       ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
-       if (ret)
-               goto err;
-
-       if (bkey_extent_is_allocation(k.k)) {
-               for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
-                    inode->inodes.data && i >= inode->inodes.data;
-                    --i) {
-                       if (i->whiteout ||
-                           i->inode.bi_snapshot > k.k->p.snapshot ||
-                           !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
-                               continue;
-
-                       i->count += k.k->size;
-               }
-       }
-
-       if (k.k->type != KEY_TYPE_whiteout) {
-               ret = extent_ends_at(c, extent_ends, s, k);
-               if (ret)
-                       goto err;
-       }
-out:
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size and i_sectors are consistent
- */
-int bch2_check_extents(struct bch_fs *c)
-{
-       struct inode_walker w = inode_walker_init();
-       struct snapshots_seen s;
-       struct extent_ends extent_ends;
-       struct disk_reservation res = { 0 };
-
-       snapshots_seen_init(&s);
-       extent_ends_init(&extent_ends);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key(trans, iter, BTREE_ID_extents,
-                               POS(BCACHEFS_ROOT_INO, 0),
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-                       bch2_disk_reservation_put(c, &res);
-                       check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
-                       check_extent_overbig(trans, &iter, k);
-               })) ?:
-               check_i_sectors_notnested(trans, &w));
-
-       bch2_disk_reservation_put(c, &res);
-       extent_ends_exit(&extent_ends);
-       inode_walker_exit(&w);
-       snapshots_seen_exit(&s);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
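-
-/*
- * The reservation is dropped at the top of each loop iteration rather
- * than only after the loop, so that a reservation still held from a
- * previous, restarted pass through the body isn't leaked when
- * check_extent() commits again with the same res.
- */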
-
-int bch2_check_indirect_extents(struct bch_fs *c)
-{
-       struct disk_reservation res = { 0 };
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
-                               POS_MIN,
-                               BTREE_ITER_prefetch, k,
-                               &res, NULL,
-                               BCH_TRANS_COMMIT_no_enospc, ({
-                       bch2_disk_reservation_put(c, &res);
-                       check_extent_overbig(trans, &iter, k);
-               })));
-
-       bch2_disk_reservation_put(c, &res);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-       s64 count2;
-
-       darray_for_each(w->inodes, i) {
-               if (i->inode.bi_nlink == i->count)
-                       continue;
-
-               count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
-               if (count2 < 0)
-                       return count2;
-
-               if (i->count != count2) {
-                       bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
-                                           w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
-                       i->count = count2;
-                       if (i->inode.bi_nlink == i->count)
-                               continue;
-               }
-
-               if (i->inode.bi_nlink != i->count) {
-                       CLASS(printbuf, buf)();
-
-                       lockrestart_do(trans,
-                                      bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
-                                                                 i->inode.bi_snapshot, NULL, &buf));
-
-                       if (fsck_err_on(i->inode.bi_nlink != i->count,
-                                       trans, inode_dir_wrong_nlink,
-                                       "directory with wrong i_nlink: got %u, should be %llu\n%s",
-                                       i->inode.bi_nlink, i->count, buf.buf)) {
-                               i->inode.bi_nlink = i->count;
-                               ret = bch2_fsck_write_inode(trans, &i->inode);
-                               if (ret)
-                                       break;
-                       }
-               }
-       }
-fsck_err:
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
-{
-       u32 restart_count = trans->restart_count;
-       return check_subdir_count_notnested(trans, w) ?:
-               trans_was_restarted(trans, restart_count);
-}
-
-/* find a subvolume that's a descendent of @snapshot: */
-static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_subvolume)
-                       continue;
-
-               struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-               if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       *subvolid = k.k->p.offset;
-                       goto found;
-               }
-       }
-       if (!ret)
-               ret = -ENOENT;
-found:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-noinline_for_stack
-static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
-                                 struct bkey_s_c_dirent d)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter subvol_iter = {};
-       struct bch_inode_unpacked subvol_root;
-       u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
-       u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
-       u32 parent_snapshot;
-       u32 new_parent_subvol = 0;
-       u64 parent_inum;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (ret ||
-           !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot)) {
-               int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
-               if (ret2 && !bch2_err_matches(ret2, ENOENT))
-                       return ret2;
-       }
-
-       if (ret &&
-           !new_parent_subvol &&
-           (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
-               /*
-                * Couldn't find a subvol for dirent's snapshot - but we lost
-                * subvols, so we need to reconstruct:
-                */
-               ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
-               if (ret)
-                       return ret;
-
-               parent_snapshot = d.k->p.snapshot;
-       }
-
-       if (fsck_err_on(ret,
-                       trans, dirent_to_missing_parent_subvol,
-                       "dirent parent_subvol points to missing subvolume\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
-           fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
-                       trans, dirent_not_visible_in_parent_subvol,
-                       "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
-                       parent_snapshot,
-                       (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-               if (!new_parent_subvol) {
-                       bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
-                       return bch_err_throw(c, fsck_repair_unimplemented);
-               }
-
-               struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
-               ret = PTR_ERR_OR_ZERO(new_dirent);
-               if (ret)
-                       goto err;
-
-               new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
-       }
-
-       struct bkey_s_c_subvolume s =
-               bch2_bkey_get_iter_typed(trans, &subvol_iter,
-                                        BTREE_ID_subvolumes, POS(0, target_subvol),
-                                        0, subvolume);
-       ret = bkey_err(s.s_c);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (ret) {
-               if (fsck_err(trans, dirent_to_missing_subvol,
-                            "dirent points to missing subvolume\n%s",
-                            (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
-                       return bch2_fsck_remove_dirent(trans, d.k->p);
-               ret = 0;
-               goto out;
-       }
-
-       if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
-               printbuf_reset(&buf);
-
-               prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
-                          parent_subvol);
-
-               ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
-                                       le64_to_cpu(s.v->inode) }, &buf);
-               if (ret)
-                       goto err;
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, s.s_c);
-
-               if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
-                       struct bkey_i_subvolume *n =
-                               bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
-                       ret = PTR_ERR_OR_ZERO(n);
-                       if (ret)
-                               goto err;
-
-                       n->v.fs_path_parent = cpu_to_le32(parent_subvol);
-               }
-       }
-
-       u64 target_inum = le64_to_cpu(s.v->inode);
-       u32 target_snapshot = le32_to_cpu(s.v->snapshot);
-
-       ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
-                                              &subvol_root, 0);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (ret) {
-               bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
-               ret = bch_err_throw(c, fsck_repair_unimplemented);
-               goto err;
-       }
-
-       if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
-                       trans, inode_bi_parent_wrong,
-                       "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
-                       target_inum,
-                       subvol_root.bi_parent_subvol, parent_subvol)) {
-               subvol_root.bi_parent_subvol = parent_subvol;
-               subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
-               ret = __bch2_fsck_write_inode(trans, &subvol_root);
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
-       if (ret)
-               goto err;
-out:
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &subvol_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
-                       struct bkey_s_c k,
-                       struct bch_hash_info *hash_info,
-                       struct inode_walker *dir,
-                       struct inode_walker *target,
-                       struct snapshots_seen *s,
-                       bool *need_second_pass)
-{
-       struct bch_fs *c = trans->c;
-       struct inode_walker_entry *i;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       ret = bch2_check_key_has_snapshot(trans, iter, k);
-       if (ret) {
-               ret = ret < 0 ? ret : 0;
-               goto out;
-       }
-
-       ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
-       if (ret)
-               goto err;
-
-       if (k.k->type == KEY_TYPE_whiteout)
-               goto out;
-
-       if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
-               ret = check_subdir_dirents_count(trans, dir);
-               if (ret)
-                       goto err;
-       }
-
-       i = walk_inode(trans, dir, k);
-       ret = PTR_ERR_OR_ZERO(i);
-       if (ret < 0)
-               goto err;
-
-       ret = check_key_has_inode(trans, iter, dir, i, k);
-       if (ret)
-               goto err;
-
-       if (!i || i->whiteout)
-               goto out;
-
-       if (dir->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &i->inode);
-       dir->first_this_inode = false;
-
-       hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
-
-       ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
-                                     iter, k, need_second_pass);
-       if (ret < 0)
-               goto err;
-       if (ret) {
-               /* dirent has been deleted */
-               ret = 0;
-               goto out;
-       }
-
-       if (k.k->type != KEY_TYPE_dirent)
-               goto out;
-
-       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-       /* check casefold */
-       if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
-                       trans, dirent_casefold_mismatch,
-                       "dirent casefold does not match dir casefold\n%s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, k),
-                        buf.buf))) {
-               subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
-                               ? le32_to_cpu(d.v->d_parent_subvol)
-                               : 0,
-               };
-               u64 target = d.v->d_type == DT_SUBVOL
-                       ? le32_to_cpu(d.v->d_child_subvol)
-                       : le64_to_cpu(d.v->d_inum);
-               struct qstr name = bch2_dirent_get_name(d);
-
-               struct bkey_i_dirent *new_d =
-                       bch2_dirent_create_key(trans, hash_info, dir_inum,
-                                              d.v->d_type, &name, NULL, target);
-               ret = PTR_ERR_OR_ZERO(new_d);
-               if (ret)
-                       goto out;
-
-               new_d->k.p.inode        = d.k->p.inode;
-               new_d->k.p.snapshot     = d.k->p.snapshot;
-
-               struct btree_iter dup_iter = {};
-               ret =   bch2_hash_delete_at(trans,
-                                           bch2_dirent_hash_desc, hash_info, iter,
-                                           BTREE_UPDATE_internal_snapshot_node) ?:
-                       bch2_str_hash_repair_key(trans, s,
-                                                &bch2_dirent_hash_desc, hash_info,
-                                                iter, bkey_i_to_s_c(&new_d->k_i),
-                                                &dup_iter, bkey_s_c_null,
-                                                need_second_pass);
-               goto out;
-       }
-
-       if (d.v->d_type == DT_SUBVOL) {
-               ret = check_dirent_to_subvol(trans, iter, d);
-               if (ret)
-                       goto err;
-       } else {
-               ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
-               if (ret)
-                       goto err;
-
-               if (fsck_err_on(!target->inodes.nr,
-                               trans, dirent_to_missing_inode,
-                               "dirent points to missing inode:\n%s",
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k),
-                                buf.buf))) {
-                       ret = bch2_fsck_remove_dirent(trans, d.k->p);
-                       if (ret)
-                               goto err;
-               }
-
-               darray_for_each(target->inodes, i) {
-                       ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
-                       if (ret)
-                               goto err;
-               }
-
-               darray_for_each(target->deletes, i)
-                       if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
-                                       trans, dirent_to_overwritten_inode,
-                                       "dirent points to inode overwritten in snapshot %u:\n%s",
-                                       *i,
-                                       (printbuf_reset(&buf),
-                                        bch2_bkey_val_to_text(&buf, c, k),
-                                        buf.buf))) {
-                               struct btree_iter delete_iter;
-                               bch2_trans_iter_init(trans, &delete_iter,
-                                                    BTREE_ID_dirents,
-                                                    SPOS(k.k->p.inode, k.k->p.offset, *i),
-                                                    BTREE_ITER_intent);
-                               ret =   bch2_btree_iter_traverse(trans, &delete_iter) ?:
-                                       bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                                         hash_info,
-                                                         &delete_iter,
-                                                         BTREE_UPDATE_internal_snapshot_node);
-                               bch2_trans_iter_exit(trans, &delete_iter);
-                               if (ret)
-                                       goto err;
-
-                       }
-       }
-
-       ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-       if (ret)
-               goto err;
-
-       for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
-               if (d.v->d_type == DT_DIR)
-                       i->count++;
-               i->i_size += bkey_bytes(d.k);
-       }
-out:
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-int bch2_check_dirents(struct bch_fs *c)
-{
-       struct inode_walker dir = inode_walker_init();
-       struct inode_walker target = inode_walker_init();
-       struct snapshots_seen s;
-       struct bch_hash_info hash_info;
-       bool need_second_pass = false, did_second_pass = false;
-       int ret;
-
-       snapshots_seen_init(&s);
-again:
-       ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
-                               POS(BCACHEFS_ROOT_INO, 0),
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
-                                    &need_second_pass)) ?:
-               check_subdir_count_notnested(trans, &dir));
-
-       if (!ret && need_second_pass && !did_second_pass) {
-               bch_info(c, "check_dirents requires second pass");
-               swap(did_second_pass, need_second_pass);
-               goto again;
-       }
-
-       if (!ret && need_second_pass) {
-               bch_err(c, "dirents not repairing");
-               ret = -EINVAL;
-       }
-
-       snapshots_seen_exit(&s);
-       inode_walker_exit(&dir);
-       inode_walker_exit(&target);
-       bch_err_fn(c, ret);
-       return ret;
-}
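-
-/*
- * Second-pass logic: bch2_str_hash_check_key() (and the casefold
- * repair in check_dirent(), via bch2_str_hash_repair_key()) sets
- * *need_second_pass when a repair may have left work at a position the
- * walk already passed. One full re-walk is allowed; if the flag is
- * still set after that, the repairs aren't converging and we fail with
- * -EINVAL rather than loop forever.
- */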
-
-static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
-                      struct bkey_s_c k,
-                      struct bch_hash_info *hash_info,
-                      struct inode_walker *inode)
-{
-       struct bch_fs *c = trans->c;
-
-       int ret = bch2_check_key_has_snapshot(trans, iter, k);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       struct inode_walker_entry *i = walk_inode(trans, inode, k);
-       ret = PTR_ERR_OR_ZERO(i);
-       if (ret)
-               return ret;
-
-       ret = check_key_has_inode(trans, iter, inode, i, k);
-       if (ret)
-               return ret;
-
-       if (!i || i->whiteout)
-               return 0;
-
-       if (inode->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &i->inode);
-       inode->first_this_inode = false;
-
-       bool need_second_pass = false;
-       return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
-                                     iter, k, &need_second_pass);
-}
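-
-/*
- * Unlike the dirents pass, need_second_pass is discarded here: the
- * xattrs pass has no retry loop, so a repair that would have wanted
- * one simply isn't retried within this pass.
- */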
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-int bch2_check_xattrs(struct bch_fs *c)
-{
-       struct inode_walker inode = inode_walker_init();
-       struct bch_hash_info hash_info;
-       int ret = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
-                       POS(BCACHEFS_ROOT_INO, 0),
-                       BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
-                       k,
-                       NULL, NULL,
-                       BCH_TRANS_COMMIT_no_enospc,
-               check_xattr(trans, &iter, k, &hash_info, &inode)));
-
-       inode_walker_exit(&inode);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_root_trans(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_inode_unpacked root_inode;
-       u32 snapshot;
-       u64 inum;
-       int ret;
-
-       ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
-                               "root subvol missing")) {
-               struct bkey_i_subvolume *root_subvol =
-                       bch2_trans_kmalloc(trans, sizeof(*root_subvol));
-               ret = PTR_ERR_OR_ZERO(root_subvol);
-               if (ret)
-                       goto err;
-
-               snapshot        = U32_MAX;
-               inum            = BCACHEFS_ROOT_INO;
-
-               bkey_subvolume_init(&root_subvol->k_i);
-               root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
-               root_subvol->v.flags    = 0;
-               root_subvol->v.snapshot = cpu_to_le32(snapshot);
-               root_subvol->v.inode    = cpu_to_le64(inum);
-               ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
-               bch_err_msg(c, ret, "writing root subvol");
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
-                                              &root_inode, 0);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (mustfix_fsck_err_on(ret,
-                               trans, root_dir_missing,
-                               "root directory missing") ||
-           mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
-                               trans, root_inode_not_dir,
-                               "root inode not a directory")) {
-               bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
-                               0, NULL);
-               root_inode.bi_inum = inum;
-               root_inode.bi_snapshot = snapshot;
-
-               ret = __bch2_fsck_write_inode(trans, &root_inode);
-               bch_err_msg(c, ret, "writing root inode");
-       }
-err:
-fsck_err:
-       return ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-int bch2_check_root(struct bch_fs *c)
-{
-       int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-               check_root_trans(trans));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static bool darray_u32_has(darray_u32 *d, u32 v)
-{
-       darray_for_each(*d, i)
-               if (*i == v)
-                       return true;
-       return false;
-}
-
-static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter parent_iter = {};
-       darray_u32 subvol_path = {};
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       if (k.k->type != KEY_TYPE_subvolume)
-               return 0;
-
-       subvol_inum start = {
-               .subvol = k.k->p.offset,
-               .inum   = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
-       };
-
-       while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
-               ret = darray_push(&subvol_path, k.k->p.offset);
-               if (ret)
-                       goto err;
-
-               struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
-               struct bch_inode_unpacked subvol_root;
-               ret = bch2_inode_find_by_inum_trans(trans,
-                                       (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
-                                       &subvol_root);
-               if (ret)
-                       break;
-
-               u32 parent = le32_to_cpu(s.v->fs_path_parent);
-
-               if (darray_u32_has(&subvol_path, parent)) {
-                       printbuf_reset(&buf);
-                       prt_printf(&buf, "subvolume loop: ");
-
-                       ret = bch2_inum_to_path(trans, start, &buf);
-                       if (ret)
-                               goto err;
-
-                       if (fsck_err(trans, subvol_loop, "%s", buf.buf))
-                               ret = reattach_subvol(trans, s);
-                       break;
-               }
-
-               bch2_trans_iter_exit(trans, &parent_iter);
-               bch2_trans_iter_init(trans, &parent_iter,
-                                    BTREE_ID_subvolumes, POS(0, parent), 0);
-               k = bch2_btree_iter_peek_slot(trans, &parent_iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
-                               trans, subvol_unreachable,
-                               "unreachable subvolume %s",
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, s.s_c),
-                                buf.buf))) {
-                       ret = reattach_subvol(trans, s);
-                       break;
-               }
-       }
-fsck_err:
-err:
-       printbuf_exit(&buf);
-       darray_exit(&subvol_path);
-       bch2_trans_iter_exit(trans, &parent_iter);
-       return ret;
-}
-
-int bch2_check_subvolume_structure(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_subvol_path(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
-                                     u64 inum, u32 snapshot,
-                                     u32 new_depth)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                                              SPOS(0, inum, snapshot), 0);
-
-       struct bch_inode_unpacked inode;
-       int ret = bkey_err(k) ?:
-               !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
-               : bch2_inode_unpack(k, &inode);
-       if (ret)
-               goto err;
-
-       if (inode.bi_depth != new_depth) {
-               inode.bi_depth = new_depth;
-               ret = __bch2_fsck_write_inode(trans, &inode) ?:
-                       bch2_trans_commit(trans, NULL, NULL, 0);
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
-                                 u32 snapshot, u32 new_bi_depth)
-{
-       u32 restart_count = trans->restart_count;
-       int ret = 0;
-
-       darray_for_each_reverse(*path, i) {
-               ret = nested_lockrestart_do(trans,
-                               bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
-               bch_err_fn(trans->c, ret);
-               if (ret)
-                       break;
-
-               new_bi_depth++;
-       }
-
-       return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter inode_iter = {};
-       darray_u64 path = {};
-       struct printbuf buf = PRINTBUF;
-       u32 snapshot = inode_k.k->p.snapshot;
-       bool redo_bi_depth = false;
-       u32 min_bi_depth = U32_MAX;
-       int ret = 0;
-
-       struct bpos start = inode_k.k->p;
-
-       struct bch_inode_unpacked inode;
-       ret = bch2_inode_unpack(inode_k, &inode);
-       if (ret)
-               return ret;
-
-       /*
- * If we're running full fsck, check_dirents() will have already run,
-        * and we shouldn't see any missing backpointers here - otherwise that's
-        * handled separately, by check_unreachable_inodes
-        */
-       while (!inode.bi_subvol &&
-              bch2_inode_has_backpointer(&inode)) {
-               struct btree_iter dirent_iter;
-               struct bkey_s_c_dirent d;
-
-               d = dirent_get_by_pos(trans, &dirent_iter,
-                                     SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
-               ret = bkey_err(d.s_c);
-               if (ret && !bch2_err_matches(ret, ENOENT))
-                       goto out;
-
-               if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
-                       bch2_trans_iter_exit(trans, &dirent_iter);
-
-               if (bch2_err_matches(ret, ENOENT)) {
-                       printbuf_reset(&buf);
-                       bch2_bkey_val_to_text(&buf, c, inode_k);
-                       bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
-                               bch2_err_str(ret), buf.buf);
-                       goto out;
-               }
-
-               bch2_trans_iter_exit(trans, &dirent_iter);
-
-               ret = darray_push(&path, inode.bi_inum);
-               if (ret)
-                       goto out;
-
-               bch2_trans_iter_exit(trans, &inode_iter);
-               inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
-                                            SPOS(0, inode.bi_dir, snapshot), 0);
-
-               struct bch_inode_unpacked parent_inode;
-               ret = bkey_err(inode_k) ?:
-                       !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
-                       : bch2_inode_unpack(inode_k, &parent_inode);
-               if (ret) {
-                       /* Should have been caught in dirents pass */
-                       bch_err_msg(c, ret, "error looking up parent directory");
-                       goto out;
-               }
-
-               min_bi_depth = parent_inode.bi_depth;
-
-               if (parent_inode.bi_depth < inode.bi_depth &&
-                   min_bi_depth < U16_MAX)
-                       break;
-
-               inode = parent_inode;
-               redo_bi_depth = true;
-
-               if (darray_find(path, inode.bi_inum)) {
-                       printbuf_reset(&buf);
-                       prt_printf(&buf, "directory structure loop in snapshot %u: ",
-                                  snapshot);
-
-                       ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
-                       if (ret)
-                               goto out;
-
-                       if (c->opts.verbose) {
-                               prt_newline(&buf);
-                               darray_for_each(path, i)
-                                       prt_printf(&buf, "%llu ", *i);
-                       }
-
-                       if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
-                               ret = remove_backpointer(trans, &inode);
-                               bch_err_msg(c, ret, "removing dirent");
-                               if (ret)
-                                       goto out;
-
-                               ret = reattach_inode(trans, &inode);
-                               bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
-                       }
-
-                       goto out;
-               }
-       }
-
-       if (inode.bi_subvol)
-               min_bi_depth = 0;
-
-       if (redo_bi_depth)
-               ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-       darray_exit(&path);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
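-
-/*
- * bi_depth acts as a cached distance from the root: the walk above
- * stops early once it reaches a parent whose bi_depth is already
- * smaller than the child's (and below U16_MAX, i.e. plausibly valid),
- * treating the rest of the path as already verified. When the walk had
- * to go further, bch2_bi_depth_renumber() rewrites consistent depths
- * along the inodes it visited.
- */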
-
-/*
- * Check for loops in the directory structure: all other connectivity issues
- * have been fixed by prior passes
- */
-int bch2_check_directory_structure(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
-                                         BTREE_ITER_intent|
-                                         BTREE_ITER_prefetch|
-                                         BTREE_ITER_all_snapshots, k,
-                                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-                       if (!S_ISDIR(bkey_inode_mode(k)))
-                               continue;
-
-                       if (bch2_inode_flags(k) & BCH_INODE_unlinked)
-                               continue;
-
-                       check_path_loop(trans, k);
-               })));
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-struct nlink_table {
-       size_t          nr;
-       size_t          size;
-
-       struct nlink {
-               u64     inum;
-               u32     snapshot;
-               u32     count;
-       }               *d;
-};
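-
-/*
- * The nlink check runs in three phases over a bounded inode range:
- * check_nlinks_find_hardlinks() fills this table with inodes that
- * might have hardlinks, check_nlinks_walk_dirents() counts dirents
- * into it, and check_nlinks_update_hardlinks() writes back corrected
- * counts. The range keeps memory bounded: if add_nlink() can't grow
- * the table, find_hardlinks records where it stopped and
- * bch2_check_nlinks() continues from there on the next pass.
- */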
-
-static int add_nlink(struct bch_fs *c, struct nlink_table *t,
-                    u64 inum, u32 snapshot)
-{
-       if (t->nr == t->size) {
-               size_t new_size = max_t(size_t, 128UL, t->size * 2);
-               void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
-
-               if (!d) {
-                       bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
-                               new_size);
-                       return bch_err_throw(c, ENOMEM_fsck_add_nlink);
-               }
-
-               if (t->d)
-                       memcpy(d, t->d, t->size * sizeof(t->d[0]));
-               kvfree(t->d);
-
-               t->d = d;
-               t->size = new_size;
-       }
-
-       t->d[t->nr++] = (struct nlink) {
-               .inum           = inum,
-               .snapshot       = snapshot,
-       };
-
-       return 0;
-}
-
-static int nlink_cmp(const void *_l, const void *_r)
-{
-       const struct nlink *l = _l;
-       const struct nlink *r = _r;
-
-       return cmp_int(l->inum, r->inum);
-}
-
-static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
-                    struct nlink_table *links,
-                    u64 range_start, u64 range_end, u64 inum, u32 snapshot)
-{
-       struct nlink *link, key = {
-               .inum = inum, .snapshot = U32_MAX,
-       };
-
-       if (inum < range_start || inum >= range_end)
-               return;
-
-       link = __inline_bsearch(&key, links->d, links->nr,
-                               sizeof(links->d[0]), nlink_cmp);
-       if (!link)
-               return;
-
-       while (link > links->d && link[0].inum == link[-1].inum)
-               --link;
-
-       for (; link < links->d + links->nr && link->inum == inum; link++)
-               if (ref_visible(c, s, snapshot, link->snapshot)) {
-                       link->count++;
-                       if (link->snapshot >= snapshot)
-                               break;
-               }
-}
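-
-/*
- * Entries are sorted by inum alone (nlink_cmp ignores snapshot), so
- * the bsearch above, keyed with snapshot = U32_MAX, can land anywhere
- * within a run of snapshot versions of the same inode; hence the walk
- * backwards to the first entry with a matching inum before counting.
- */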
-
-noinline_for_stack
-static int check_nlinks_find_hardlinks(struct bch_fs *c,
-                                      struct nlink_table *t,
-                                      u64 start, u64 *end)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key(trans, iter, BTREE_ID_inodes,
-                                  POS(0, start),
-                                  BTREE_ITER_intent|
-                                  BTREE_ITER_prefetch|
-                                  BTREE_ITER_all_snapshots, k, ({
-                       if (!bkey_is_inode(k.k))
-                               continue;
-
-                       /* Should never fail, checked by bch2_inode_invalid: */
-                       struct bch_inode_unpacked u;
-                       _ret3 = bch2_inode_unpack(k, &u);
-                       if (_ret3)
-                               break;
-
-                       /*
-                        * Backpointer and directory structure checks are sufficient for
-                        * directories, since they can't have hardlinks:
-                        */
-                       if (S_ISDIR(u.bi_mode))
-                               continue;
-
-                       /*
-                        * Previous passes ensured that bi_nlink is nonzero if
-                        * it had multiple hardlinks:
-                        */
-                       if (!u.bi_nlink)
-                               continue;
-
-                       ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
-                       if (ret) {
-                               *end = k.k->p.offset;
-                               ret = 0;
-                               break;
-                       }
-                       0;
-               })));
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
-                                    u64 range_start, u64 range_end)
-{
-       struct snapshots_seen s;
-
-       snapshots_seen_init(&s);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
-                                  BTREE_ITER_intent|
-                                  BTREE_ITER_prefetch|
-                                  BTREE_ITER_all_snapshots, k, ({
-                       ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
-                       if (ret)
-                               break;
-
-                       if (k.k->type == KEY_TYPE_dirent) {
-                               struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
-                               if (d.v->d_type != DT_DIR &&
-                                   d.v->d_type != DT_SUBVOL)
-                                       inc_link(c, &s, links, range_start, range_end,
-                                                le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
-                       }
-                       0;
-               })));
-
-       snapshots_seen_exit(&s);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
-                                    struct bkey_s_c k,
-                                    struct nlink_table *links,
-                                    size_t *idx, u64 range_end)
-{
-       struct bch_inode_unpacked u;
-       struct nlink *link = &links->d[*idx];
-       int ret = 0;
-
-       if (k.k->p.offset >= range_end)
-               return 1;
-
-       if (!bkey_is_inode(k.k))
-               return 0;
-
-       ret = bch2_inode_unpack(k, &u);
-       if (ret)
-               return ret;
-
-       if (S_ISDIR(u.bi_mode))
-               return 0;
-
-       if (!u.bi_nlink)
-               return 0;
-
-       while ((cmp_int(link->inum, k.k->p.offset) ?:
-               cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
-               BUG_ON(*idx == links->nr);
-               link = &links->d[++*idx];
-       }
-
-       if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
-                       trans, inode_wrong_nlink,
-                       "inode %llu type %s has wrong i_nlink (%u, should be %u)",
-                       u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
-                       bch2_inode_nlink_get(&u), link->count)) {
-               bch2_inode_nlink_set(&u, link->count);
-               ret = __bch2_fsck_write_inode(trans, &u);
-       }
-fsck_err:
-       return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_update_hardlinks(struct bch_fs *c,
-                              struct nlink_table *links,
-                              u64 range_start, u64 range_end)
-{
-       size_t idx = 0;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
-                               POS(0, range_start),
-                               BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
-       if (ret < 0) {
-               bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
-               return ret;
-       }
-
-       return 0;
-}
-
-int bch2_check_nlinks(struct bch_fs *c)
-{
-       struct nlink_table links = { 0 };
-       u64 this_iter_range_start, next_iter_range_start = 0;
-       int ret = 0;
-
-       do {
-               this_iter_range_start = next_iter_range_start;
-               next_iter_range_start = U64_MAX;
-
-               ret = check_nlinks_find_hardlinks(c, &links,
-                                                 this_iter_range_start,
-                                                 &next_iter_range_start);
-               if (ret)
-                       break;
-
-               ret = check_nlinks_walk_dirents(c, &links,
-                                         this_iter_range_start,
-                                         next_iter_range_start);
-               if (ret)
-                       break;
-
-               ret = check_nlinks_update_hardlinks(c, &links,
-                                        this_iter_range_start,
-                                        next_iter_range_start);
-               if (ret)
-                       break;
-
-               links.nr = 0;
-       } while (next_iter_range_start != U64_MAX);
-
-       kvfree(links.d);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
-                            struct bkey_s_c k)
-{
-       struct bkey_s_c_reflink_p p;
-       struct bkey_i_reflink_p *u;
-
-       if (k.k->type != KEY_TYPE_reflink_p)
-               return 0;
-
-       p = bkey_s_c_to_reflink_p(k);
-
-       if (!p.v->front_pad && !p.v->back_pad)
-               return 0;
-
-       u = bch2_trans_kmalloc(trans, sizeof(*u));
-       int ret = PTR_ERR_OR_ZERO(u);
-       if (ret)
-               return ret;
-
-       bkey_reassemble(&u->k_i, k);
-       u->v.front_pad  = 0;
-       u->v.back_pad   = 0;
-
-       return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
-}
-
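-/*
- * Only filesystems from before the reflink_p_fix version need this pass;
- * fix_reflink_p_key() zeroes the padding with BTREE_TRIGGER_norun so the
- * update doesn't touch reflink refcounts.
- */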
-int bch2_fix_reflink_p(struct bch_fs *c)
-{
-       if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
-               return 0;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_extents, POS_MIN,
-                               BTREE_ITER_intent|BTREE_ITER_prefetch|
-                               BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       fix_reflink_p_key(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-#ifndef NO_BCACHEFS_CHARDEV
-
-struct fsck_thread {
-       struct thread_with_stdio thr;
-       struct bch_fs           *c;
-       struct bch_opts         opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
-       struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
-       kfree(thr);
-}
-
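-/*
- * The return value follows fsck(8) exit codes: 1 if errors were fixed,
- * 4 if errors remain.
- */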
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
-       struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-       struct bch_fs *c = thr->c;
-
-       int ret = PTR_ERR_OR_ZERO(c);
-       if (ret)
-               return ret;
-
-       ret = bch2_fs_start(thr->c);
-       if (ret)
-               goto err;
-
-       if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
-               bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
-               ret |= 1;
-       }
-       if (test_bit(BCH_FS_error, &c->flags)) {
-               bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-               ret |= 4;
-       }
-err:
-       bch2_fs_stop(c);
-       return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
-       .exit           = bch2_fsck_thread_exit,
-       .fn             = bch2_fsck_offline_thread_fn,
-};
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
-       struct bch_ioctl_fsck_offline arg;
-       struct fsck_thread *thr = NULL;
-       darray_const_str devs = {};
-       long ret = 0;
-
-       if (copy_from_user(&arg, user_arg, sizeof(arg)))
-               return -EFAULT;
-
-       if (arg.flags)
-               return -EINVAL;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       for (size_t i = 0; i < arg.nr_devs; i++) {
-               u64 dev_u64;
-               ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
-               if (ret)
-                       goto err;
-
-               char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
-               ret = PTR_ERR_OR_ZERO(dev_str);
-               if (ret)
-                       goto err;
-
-               ret = darray_push(&devs, dev_str);
-               if (ret) {
-                       kfree(dev_str);
-                       goto err;
-               }
-       }
-
-       thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-       if (!thr) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       thr->opts = bch2_opts_empty();
-
-       if (arg.opts) {
-               char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-               ret =   PTR_ERR_OR_ZERO(optstr) ?:
-                       bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
-               if (!IS_ERR(optstr))
-                       kfree(optstr);
-
-               if (ret)
-                       goto err;
-       }
-
-       opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
-       opt_set(thr->opts, read_only, 1);
-       opt_set(thr->opts, ratelimit_errors, 0);
-
-       /* We need request_key() to be called before we punt to kthread: */
-       opt_set(thr->opts, nostart, true);
-
-       bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
-       thr->c = bch2_fs_open(&devs, &thr->opts);
-
-       if (!IS_ERR(thr->c) &&
-           thr->c->opts.errors == BCH_ON_ERROR_panic)
-               thr->c->opts.errors = BCH_ON_ERROR_ro;
-
-       ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
-       darray_for_each(devs, i)
-               kfree(*i);
-       darray_exit(&devs);
-       return ret;
-err:
-       if (thr)
-               bch2_fsck_thread_exit(&thr->thr);
-       pr_err("ret %s", bch2_err_str(ret));
-       goto out;
-}
-
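-/*
- * Online fsck reruns the recovery passes against a mounted filesystem; the
- * ioctl took c->recovery.run_lock and a ro ref, both of which are released
- * here when the thread finishes.
- */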
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
-       struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-       struct bch_fs *c = thr->c;
-
-       c->stdio_filter = current;
-       c->stdio = &thr->thr.stdio;
-
-       /*
-        * XXX: can we figure out a way to do this without mucking with c->opts?
-        */
-       unsigned old_fix_errors = c->opts.fix_errors;
-       if (opt_defined(thr->opts, fix_errors))
-               c->opts.fix_errors = thr->opts.fix_errors;
-       else
-               c->opts.fix_errors = FSCK_FIX_ask;
-
-       c->opts.fsck = true;
-       set_bit(BCH_FS_in_fsck, &c->flags);
-
-       int ret = bch2_run_online_recovery_passes(c, ~0ULL);
-
-       clear_bit(BCH_FS_in_fsck, &c->flags);
-       bch_err_fn(c, ret);
-
-       c->stdio = NULL;
-       c->stdio_filter = NULL;
-       c->opts.fix_errors = old_fix_errors;
-
-       up(&c->recovery.run_lock);
-       bch2_ro_ref_put(c);
-       return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
-       .exit           = bch2_fsck_thread_exit,
-       .fn             = bch2_fsck_online_thread_fn,
-};
-
-long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
-{
-       struct fsck_thread *thr = NULL;
-       long ret = 0;
-
-       if (arg.flags)
-               return -EINVAL;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (!bch2_ro_ref_tryget(c))
-               return -EROFS;
-
-       if (down_trylock(&c->recovery.run_lock)) {
-               bch2_ro_ref_put(c);
-               return -EAGAIN;
-       }
-
-       thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-       if (!thr) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       thr->c = c;
-       thr->opts = bch2_opts_empty();
-
-       if (arg.opts) {
-               char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
-               ret =   PTR_ERR_OR_ZERO(optstr) ?:
-                       bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
-               if (!IS_ERR(optstr))
-                       kfree(optstr);
-
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
-       if (ret < 0) {
-               bch_err_fn(c, ret);
-               if (thr)
-                       bch2_fsck_thread_exit(&thr->thr);
-               up(&c->recovery.run_lock);
-               bch2_ro_ref_put(c);
-       }
-       return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
deleted file mode 100644 (file)
index e5fe7cf..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FSCK_H
-#define _BCACHEFS_FSCK_H
-
-#include "str_hash.h"
-
-/* records snapshot IDs of overwrites at @pos */
-struct snapshots_seen {
-       struct bpos                     pos;
-       snapshot_id_list                ids;
-};
-
-int bch2_fsck_update_backpointers(struct btree_trans *,
-                                 struct snapshots_seen *,
-                                 const struct bch_hash_desc,
-                                 struct bch_hash_info *,
-                                 struct bkey_i *);
-
-int bch2_check_inodes(struct bch_fs *);
-int bch2_check_extents(struct bch_fs *);
-int bch2_check_indirect_extents(struct bch_fs *);
-int bch2_check_dirents(struct bch_fs *);
-int bch2_check_xattrs(struct bch_fs *);
-int bch2_check_root(struct bch_fs *);
-int bch2_check_subvolume_structure(struct bch_fs *);
-int bch2_check_unreachable_inodes(struct bch_fs *);
-int bch2_check_directory_structure(struct bch_fs *);
-int bch2_check_nlinks(struct bch_fs *);
-int bch2_fix_reflink_p(struct bch_fs *);
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
-long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
-
-#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
deleted file mode 100644 (file)
index ef4cc73..0000000
+++ /dev/null
@@ -1,1566 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_write_buffer.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "inode.h"
-#include "namei.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "varint.h"
-
-#include <linux/random.h>
-
-#include <linux/unaligned.h>
-
-#define x(name, ...)   #name,
-const char * const bch2_inode_opts[] = {
-       BCH_INODE_OPTS()
-       NULL,
-};
-
-static const char * const bch2_inode_flag_strs[] = {
-       BCH_INODE_FLAGS()
-       NULL
-};
-#undef  x
-
-static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
-static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-
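-/*
- * The number of leading zero bits in the first byte gives the field's
- * total length via byte_table: 0b1xxxxxxx is a 1 byte field (7 value
- * bits), 0b01xxxxxx a 2 byte field, ..., 0b00000001 a 13 byte field.
- */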
-static int inode_decode_field(const u8 *in, const u8 *end,
-                             u64 out[2], unsigned *out_bits)
-{
-       __be64 be[2] = { 0, 0 };
-       unsigned bytes, shift;
-       u8 *p;
-
-       if (in >= end)
-               return -BCH_ERR_inode_unpack_error;
-
-       if (!*in)
-               return -BCH_ERR_inode_unpack_error;
-
-       /*
-        * position of highest set bit indicates number of bytes:
-        * shift = number of bits to remove in high byte:
-        */
-       shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
-       bytes   = byte_table[shift - 1];
-
-       if (in + bytes > end)
-               return -BCH_ERR_inode_unpack_error;
-
-       p = (u8 *) be + 16 - bytes;
-       memcpy(p, in, bytes);
-       *p ^= (1 << 8) >> shift;
-
-       out[0] = be64_to_cpu(be[0]);
-       out[1] = be64_to_cpu(be[1]);
-       *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
-
-       return bytes;
-}
-
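-/*
- * Fields are varint encoded in declaration order; trailing zero fields are
- * dropped and only nr_fields is stored, so new fields can be appended
- * compatibly -- the unpack paths read fields past nr_fields as zero.
- */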
-static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
-                                          const struct bch_inode_unpacked *inode)
-{
-       struct bkey_i_inode_v3 *k = &packed->inode;
-       u8 *out = k->v.fields;
-       u8 *end = (void *) &packed[1];
-       u8 *last_nonzero_field = out;
-       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-       unsigned bytes;
-       int ret;
-
-       bkey_inode_v3_init(&packed->inode.k_i);
-       packed->inode.k.p.offset        = inode->bi_inum;
-       packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
-       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
-       packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
-       packed->inode.v.bi_sectors      = cpu_to_le64(inode->bi_sectors);
-       packed->inode.v.bi_size         = cpu_to_le64(inode->bi_size);
-       packed->inode.v.bi_version      = cpu_to_le64(inode->bi_version);
-       SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
-       SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-
-#define x(_name, _bits)                                                        \
-       nr_fields++;                                                    \
-                                                                       \
-       if (inode->_name) {                                             \
-               ret = bch2_varint_encode_fast(out, inode->_name);       \
-               out += ret;                                             \
-                                                                       \
-               if (_bits > 64)                                         \
-                       *out++ = 0;                                     \
-                                                                       \
-               last_nonzero_field = out;                               \
-               last_nonzero_fieldnr = nr_fields;                       \
-       } else {                                                        \
-               *out++ = 0;                                             \
-                                                                       \
-               if (_bits > 64)                                         \
-                       *out++ = 0;                                     \
-       }
-
-       BCH_INODE_FIELDS_v3()
-#undef  x
-       BUG_ON(out > end);
-
-       out = last_nonzero_field;
-       nr_fields = last_nonzero_fieldnr;
-
-       bytes = out - (u8 *) &packed->inode.v;
-       set_bkey_val_bytes(&packed->inode.k, bytes);
-       memset_u64s_tail(&packed->inode.v, 0, bytes);
-
-       SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-               struct bch_inode_unpacked unpacked;
-
-               ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
-               BUG_ON(ret);
-               BUG_ON(unpacked.bi_inum         != inode->bi_inum);
-               BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
-               BUG_ON(unpacked.bi_sectors      != inode->bi_sectors);
-               BUG_ON(unpacked.bi_size         != inode->bi_size);
-               BUG_ON(unpacked.bi_version      != inode->bi_version);
-               BUG_ON(unpacked.bi_mode         != inode->bi_mode);
-
-#define x(_name, _bits)        if (unpacked._name != inode->_name)             \
-                       panic("unpacked %llu should be %llu",           \
-                             (u64) unpacked._name, (u64) inode->_name);
-               BCH_INODE_FIELDS_v3()
-#undef  x
-       }
-}
-
-void bch2_inode_pack(struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
-{
-       bch2_inode_pack_inlined(packed, inode);
-}
-
-static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
-                               struct bch_inode_unpacked *unpacked)
-{
-       const u8 *in = inode.v->fields;
-       const u8 *end = bkey_val_end(inode);
-       u64 field[2];
-       unsigned fieldnr = 0, field_bits;
-       int ret;
-
-#define x(_name, _bits)                                                        \
-       if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) {                  \
-               unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
-               memset((void *) unpacked + offset, 0,                   \
-                      sizeof(*unpacked) - offset);                     \
-               return 0;                                               \
-       }                                                               \
-                                                                       \
-       ret = inode_decode_field(in, end, field, &field_bits);          \
-       if (ret < 0)                                                    \
-               return ret;                                             \
-                                                                       \
-       if (field_bits > sizeof(unpacked->_name) * 8)                   \
-               return -BCH_ERR_inode_unpack_error;                     \
-                                                                       \
-       unpacked->_name = field[1];                                     \
-       in += ret;
-
-       BCH_INODE_FIELDS_v2()
-#undef  x
-
-       /* XXX: signal if there were more fields than expected? */
-       return 0;
-}
-
-static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
-                               const u8 *in, const u8 *end,
-                               unsigned nr_fields)
-{
-       unsigned fieldnr = 0;
-       int ret;
-       u64 v[2];
-
-#define x(_name, _bits)                                                        \
-       if (fieldnr < nr_fields) {                                      \
-               ret = bch2_varint_decode_fast(in, end, &v[0]);          \
-               if (ret < 0)                                            \
-                       return ret;                                     \
-               in += ret;                                              \
-                                                                       \
-               if (_bits > 64) {                                       \
-                       ret = bch2_varint_decode_fast(in, end, &v[1]);  \
-                       if (ret < 0)                                    \
-                               return ret;                             \
-                       in += ret;                                      \
-               } else {                                                \
-                       v[1] = 0;                                       \
-               }                                                       \
-       } else {                                                        \
-               v[0] = v[1] = 0;                                        \
-       }                                                               \
-                                                                       \
-       unpacked->_name = v[0];                                         \
-       if (v[1] || v[0] != unpacked->_name)                            \
-               return -BCH_ERR_inode_unpack_error;                     \
-       fieldnr++;
-
-       BCH_INODE_FIELDS_v2()
-#undef  x
-
-       /* XXX: signal if there were more fields than expected? */
-       return 0;
-}
-
-static int bch2_inode_unpack_v3(struct bkey_s_c k,
-                               struct bch_inode_unpacked *unpacked)
-{
-       struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
-       const u8 *in = inode.v->fields;
-       const u8 *end = bkey_val_end(inode);
-       unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
-       unsigned fieldnr = 0;
-       int ret;
-       u64 v[2];
-
-       unpacked->bi_inum       = inode.k->p.offset;
-       unpacked->bi_snapshot   = inode.k->p.snapshot;
-       unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
-       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-       unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
-       unpacked->bi_sectors    = le64_to_cpu(inode.v->bi_sectors);
-       unpacked->bi_size       = le64_to_cpu(inode.v->bi_size);
-       unpacked->bi_version    = le64_to_cpu(inode.v->bi_version);
-       unpacked->bi_mode       = INODEv3_MODE(inode.v);
-
-#define x(_name, _bits)                                                        \
-       if (fieldnr < nr_fields) {                                      \
-               ret = bch2_varint_decode_fast(in, end, &v[0]);          \
-               if (ret < 0)                                            \
-                       return ret;                                     \
-               in += ret;                                              \
-                                                                       \
-               if (_bits > 64) {                                       \
-                       ret = bch2_varint_decode_fast(in, end, &v[1]);  \
-                       if (ret < 0)                                    \
-                               return ret;                             \
-                       in += ret;                                      \
-               } else {                                                \
-                       v[1] = 0;                                       \
-               }                                                       \
-       } else {                                                        \
-               v[0] = v[1] = 0;                                        \
-       }                                                               \
-                                                                       \
-       unpacked->_name = v[0];                                         \
-       if (v[1] || v[0] != unpacked->_name)                            \
-               return -BCH_ERR_inode_unpack_error;                     \
-       fieldnr++;
-
-       BCH_INODE_FIELDS_v3()
-#undef  x
-
-       /* XXX: signal if there were more fields than expected? */
-       return 0;
-}
-
-static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
-                                              struct bch_inode_unpacked *unpacked)
-{
-       memset(unpacked, 0, sizeof(*unpacked));
-
-       switch (k.k->type) {
-       case KEY_TYPE_inode: {
-               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-
-               unpacked->bi_inum       = inode.k->p.offset;
-               unpacked->bi_snapshot   = inode.k->p.snapshot;
-               unpacked->bi_journal_seq= 0;
-               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-               unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
-               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
-
-               if (INODEv1_NEW_VARINT(inode.v)) {
-                       return bch2_inode_unpack_v2(unpacked, inode.v->fields,
-                                                   bkey_val_end(inode),
-                                                   INODEv1_NR_FIELDS(inode.v));
-               } else {
-                       return bch2_inode_unpack_v1(inode, unpacked);
-               }
-               break;
-       }
-       case KEY_TYPE_inode_v2: {
-               struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-
-               unpacked->bi_inum       = inode.k->p.offset;
-               unpacked->bi_snapshot   = inode.k->p.snapshot;
-               unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
-               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-               unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
-               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
-
-               return bch2_inode_unpack_v2(unpacked, inode.v->fields,
-                                           bkey_val_end(inode),
-                                           INODEv2_NR_FIELDS(inode.v));
-       }
-       default:
-               BUG();
-       }
-}
-
-int bch2_inode_unpack(struct bkey_s_c k,
-                     struct bch_inode_unpacked *unpacked)
-{
-       return likely(k.k->type == KEY_TYPE_inode_v3)
-               ? bch2_inode_unpack_v3(k, unpacked)
-               : bch2_inode_unpack_slowpath(k, unpacked);
-}
-
-int __bch2_inode_peek(struct btree_trans *trans,
-                     struct btree_iter *iter,
-                     struct bch_inode_unpacked *inode,
-                     subvol_inum inum, unsigned flags,
-                     bool warn)
-{
-       u32 snapshot;
-       int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
-       if (ret)
-               return ret;
-
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
-                                              SPOS(0, inum.inum, snapshot),
-                                              flags|BTREE_ITER_cached);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_unpack(k, inode);
-       if (ret)
-               goto err;
-
-       return 0;
-err:
-       if (warn)
-               bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
-       bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
-                                           u64 inode_nr, u32 snapshot,
-                                           struct bch_inode_unpacked *inode,
-                                           unsigned flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                                              SPOS(0, inode_nr, snapshot), flags);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       ret = bkey_is_inode(k.k)
-               ? bch2_inode_unpack(k, inode)
-               : -BCH_ERR_ENOENT_inode;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
-                                 subvol_inum inum,
-                                 struct bch_inode_unpacked *inode)
-{
-       struct btree_iter iter;
-       int ret;
-
-       ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
-       if (!ret)
-               bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
-                                 subvol_inum inum,
-                                 struct bch_inode_unpacked *inode)
-{
-       struct btree_iter iter;
-       int ret;
-
-       ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
-       if (!ret)
-               bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
-                           struct bch_inode_unpacked *inode)
-{
-       return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
-                                 struct bch_inode_unpacked *root)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
-                                            SPOS(0, inum, U32_MAX),
-                                            BTREE_ITER_all_snapshots, k, ret) {
-               if (k.k->p.offset != inum)
-                       break;
-               if (bkey_is_inode(k.k)) {
-                       ret = bch2_inode_unpack(k, root);
-                       goto out;
-               }
-       }
-       /* We're only called when we know we have an inode for @inum */
-       BUG_ON(!ret);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_inode_write_flags(struct btree_trans *trans,
-                    struct btree_iter *iter,
-                    struct bch_inode_unpacked *inode,
-                    enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_inode_buf *inode_p;
-
-       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-       if (IS_ERR(inode_p))
-               return PTR_ERR(inode_p);
-
-       bch2_inode_pack_inlined(inode_p, inode);
-       inode_p->inode.k.p.snapshot = iter->snapshot;
-       return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
-       struct bkey_inode_buf *inode_p =
-               bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
-       if (IS_ERR(inode_p))
-               return PTR_ERR(inode_p);
-
-       bch2_inode_pack(inode_p, inode);
-       inode_p->inode.k.p.snapshot = inode->bi_snapshot;
-
-       return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-                               &inode_p->inode.k_i,
-                               BTREE_UPDATE_internal_snapshot_node);
-}
-
-int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
-       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                           __bch2_fsck_write_inode(trans, inode));
-       bch_err_fn(trans->c, ret);
-       return ret;
-}
-
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
-{
-       struct bch_inode_unpacked u;
-       struct bkey_inode_buf *inode_p;
-       int ret;
-
-       if (!bkey_is_inode(&k->k))
-               return ERR_PTR(-ENOENT);
-
-       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-       if (IS_ERR(inode_p))
-               return ERR_CAST(inode_p);
-
-       ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
-       if (ret)
-               return ERR_PTR(ret);
-
-       bch2_inode_pack(inode_p, &u);
-       return &inode_p->inode.k_i;
-}
-
-static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
-                                struct bkey_validate_context from)
-{
-       struct bch_inode_unpacked unpacked;
-       int ret = 0;
-
-       bkey_fsck_err_on(k.k->p.inode,
-                        c, inode_pos_inode_nonzero,
-                        "nonzero k.p.inode");
-
-       bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
-                        c, inode_pos_blockdev_range,
-                        "fs inode in blockdev range");
-
-       bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
-                        c, inode_unpack_error,
-                        "invalid variable length fields");
-
-       bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
-                        c, inode_checksum_type_invalid,
-                        "invalid data checksum type (%u >= %u)",
-                        unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-
-       bkey_fsck_err_on(unpacked.bi_compression &&
-                        !bch2_compression_opt_valid(unpacked.bi_compression - 1),
-                        c, inode_compression_type_invalid,
-                        "invalid compression opt %u", unpacked.bi_compression - 1);
-
-       bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
-                        unpacked.bi_nlink != 0,
-                        c, inode_unlinked_but_nlink_nonzero,
-                        "flagged as unlinked but bi_nlink != 0");
-
-       bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
-                        c, inode_subvol_root_but_not_dir,
-                        "subvolume root but not a directory");
-fsck_err:
-       return ret;
-}
-
-int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
-                       struct bkey_validate_context from)
-{
-       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
-                        c, inode_str_hash_invalid,
-                        "invalid str hash type (%llu >= %u)",
-                        INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-       ret = __bch2_inode_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
-                        c, inode_str_hash_invalid,
-                        "invalid str hash type (%llu >= %u)",
-                        INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-       ret = __bch2_inode_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
-                        INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
-                        c, inode_v3_fields_start_bad,
-                        "invalid fields_start (got %llu, min %u max %zu)",
-                        INODEv3_FIELDS_START(inode.v),
-                        INODEv3_FIELDS_START_INITIAL,
-                        bkey_val_u64s(inode.k));
-
-       bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
-                        c, inode_str_hash_invalid,
-                        "invalid str hash type (%llu >= %u)",
-                        INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
-       ret = __bch2_inode_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-static void __bch2_inode_unpacked_to_text(struct printbuf *out,
-                                         struct bch_inode_unpacked *inode)
-{
-       prt_printf(out, "\n");
-       printbuf_indent_add(out, 2);
-       prt_printf(out, "mode=%o\n", inode->bi_mode);
-
-       prt_str(out, "flags=");
-       prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
-       prt_printf(out, "(%x)\n", inode->bi_flags);
-
-       prt_printf(out, "journal_seq=%llu\n",   inode->bi_journal_seq);
-       prt_printf(out, "hash_seed=%llx\n",     inode->bi_hash_seed);
-       prt_printf(out, "hash_type=");
-       bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
-       prt_newline(out);
-       prt_printf(out, "bi_size=%llu\n",       inode->bi_size);
-       prt_printf(out, "bi_sectors=%llu\n",    inode->bi_sectors);
-       prt_printf(out, "bi_version=%llu\n",    inode->bi_version);
-
-#define x(_name, _bits)                                                \
-       prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
-       BCH_INODE_FIELDS_v3()
-#undef  x
-
-       bch2_printbuf_strip_trailing_newline(out);
-       printbuf_indent_sub(out, 2);
-}
-
-void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
-{
-       prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
-       __bch2_inode_unpacked_to_text(out, inode);
-}
-
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bch_inode_unpacked inode;
-
-       if (bch2_inode_unpack(k, &inode)) {
-               prt_printf(out, "(unpack error)");
-               return;
-       }
-
-       __bch2_inode_unpacked_to_text(out, &inode);
-}
-
-static inline u64 bkey_inode_flags(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_inode:
-               return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
-       case KEY_TYPE_inode_v2:
-               return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
-       case KEY_TYPE_inode_v3:
-               return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
-       default:
-               return 0;
-       }
-}
-
-static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_inode:
-               bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
-               return;
-       case KEY_TYPE_inode_v2:
-               bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
-               return;
-       case KEY_TYPE_inode_v3:
-               bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
-               return;
-       default:
-               BUG();
-       }
-}
-
-static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
-{
-       u64 f = bkey_inode_flags(k);
-
-       /* unlinked, but still referenced by a child snapshot -> keep it: */
-       return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
-}
-
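-/*
- * Scan forward from @pos for a key at the same inode number in an ancestor
- * snapshot, i.e. the parent snapshot's version of this inode.
- */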
-static struct bkey_s_c
-bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
-                                  enum btree_id btree, struct bpos pos,
-                                  unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       for_each_btree_key_max_norestart(trans, *iter, btree,
-                                         bpos_successor(pos),
-                                         SPOS(pos.inode, pos.offset, U32_MAX),
-                                         flags|BTREE_ITER_all_snapshots, k, ret)
-               if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
-                       return k;
-
-       bch2_trans_iter_exit(trans, iter);
-       return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-static struct bkey_s_c
-bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
-                                   struct bpos pos, unsigned flags)
-{
-       struct bkey_s_c k;
-again:
-       k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
-       if (!k.k ||
-           bkey_err(k) ||
-           bkey_is_inode(k.k))
-               return k;
-
-       bch2_trans_iter_exit(trans, iter);
-       pos = k.k->p;
-       goto again;
-}
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       for_each_btree_key_max_norestart(trans, iter,
-                       BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
-                       BTREE_ITER_all_snapshots|
-                       BTREE_ITER_with_updates, k, ret)
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
-                   bkey_is_inode(k.k)) {
-                       ret = 1;
-                       break;
-               }
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int update_inode_has_children(struct btree_trans *trans,
-                                    struct bkey_s k,
-                                    bool have_child)
-{
-       if (!have_child) {
-               int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
-               if (ret)
-                       return ret < 0 ? ret : 0;
-       }
-
-       u64 f = bkey_inode_flags(k.s_c);
-       if (have_child != !!(f & BCH_INODE_has_child_snapshot))
-               bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
-
-       return 0;
-}
-
-static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
-                                           bool have_child)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
-                                               &iter, pos, BTREE_ITER_with_updates);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-       if (!k.k)
-               return 0;
-
-       if (!have_child) {
-               ret = bch2_inode_has_child_snapshots(trans, k.k->p);
-               if (ret) {
-                       ret = ret < 0 ? ret : 0;
-                       goto err;
-               }
-       }
-
-       u64 f = bkey_inode_flags(k);
-       if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
-               struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
-                                            BTREE_UPDATE_internal_snapshot_node);
-               ret = PTR_ERR_OR_ZERO(update);
-               if (ret)
-                       goto err;
-
-               bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
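-/*
- * The inode trigger stamps bi_journal_seq on atomic inserts, keeps the
- * nr_inodes accounting counter in sync, maintains the deleted_inodes btree
- * via the unlinked flag, and propagates has_child_snapshot as inodes are
- * created and deleted in child snapshots.
- */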
-int bch2_trigger_inode(struct btree_trans *trans,
-                      enum btree_id btree_id, unsigned level,
-                      struct bkey_s_c old,
-                      struct bkey_s new,
-                      enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-
-       if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
-               BUG_ON(!trans->journal_res.seq);
-               bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
-       }
-
-       s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
-       if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
-               int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
-               if (ret)
-                       return ret;
-       }
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               int unlinked_delta =    (int) bkey_is_unlinked_inode(new.s_c) -
-                                       (int) bkey_is_unlinked_inode(old);
-               if (unlinked_delta) {
-                       int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
-                                                             new.k->p, unlinked_delta > 0);
-                       if (ret)
-                               return ret;
-               }
-
-               /*
-                * If we're creating or deleting an inode at this snapshot ID,
-                * and there might be an inode in a parent snapshot ID, we might
-                * need to set or clear the has_child_snapshot flag on the
-                * parent.
-                */
-               int deleted_delta = (int) bkey_is_inode(new.k) -
-                                   (int) bkey_is_inode(old.k);
-               if (deleted_delta &&
-                   bch2_snapshot_parent(c, new.k->p.snapshot)) {
-                       int ret = update_parent_inode_has_children(trans, new.k->p,
-                                                                  deleted_delta > 0);
-                       if (ret)
-                               return ret;
-               }
-
-               /*
-                * When an inode is first updated in a new snapshot, we may need
-                * to clear has_child_snapshot
-                */
-               if (deleted_delta > 0) {
-                       int ret = update_inode_has_children(trans, new, false);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
-                                  struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(k.k->p.inode,
-                        c, inode_pos_inode_nonzero,
-                        "nonzero k.p.inode");
-fsck_err:
-       return ret;
-}
-
-void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
-                                  struct bkey_s_c k)
-{
-       struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
-
-       prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
-}
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
-                                  struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
-                        c, inode_alloc_cursor_inode_bad,
-                        "k.p.inode bad");
-fsck_err:
-       return ret;
-}
-
-void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
-                                    struct bkey_s_c k)
-{
-       struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
-
-       prt_printf(out, "idx %llu generation %llu",
-                  le64_to_cpu(i.v->idx),
-                  le64_to_cpu(i.v->gen));
-}
-
-void bch2_inode_init_early(struct bch_fs *c,
-                          struct bch_inode_unpacked *inode_u)
-{
-       enum bch_str_hash_type str_hash =
-               bch2_str_hash_opt_to_type(c, c->opts.str_hash);
-
-       memset(inode_u, 0, sizeof(*inode_u));
-
-       SET_INODE_STR_HASH(inode_u, str_hash);
-       get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
-}
-
-void bch2_inode_init_late(struct bch_fs *c,
-                         struct bch_inode_unpacked *inode_u, u64 now,
-                         uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-                         struct bch_inode_unpacked *parent)
-{
-       inode_u->bi_mode        = mode;
-       inode_u->bi_uid         = uid;
-       inode_u->bi_gid         = gid;
-       inode_u->bi_dev         = rdev;
-       inode_u->bi_atime       = now;
-       inode_u->bi_mtime       = now;
-       inode_u->bi_ctime       = now;
-       inode_u->bi_otime       = now;
-
-       if (parent && parent->bi_mode & S_ISGID) {
-               inode_u->bi_gid = parent->bi_gid;
-               if (S_ISDIR(mode))
-                       inode_u->bi_mode |= S_ISGID;
-       }
-
-       if (parent) {
-#define x(_name, ...)  inode_u->bi_##_name = parent->bi_##_name;
-               BCH_INODE_OPTS()
-#undef x
-       }
-
-       if (!S_ISDIR(mode))
-               inode_u->bi_casefold = 0;
-
-       if (bch2_inode_casefold(c, inode_u))
-               inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
-}
-
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
-                    uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-                    struct bch_inode_unpacked *parent)
-{
-       bch2_inode_init_early(c, inode_u);
-       bch2_inode_init_late(c, inode_u, bch2_current_time(c),
-                            uid, gid, mode, rdev, parent);
-}
-
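-/*
- * Inode numbers above INT_MAX are sharded: with shard_inode_numbers_bits
- * = b, the top b bits (below the sign bit) select a per-cpu shard and the
- * low 63 - b bits are a sequential index, with each shard's allocation
- * cursor persisted in the logged_ops btree.
- */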
-static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
-{
-       struct bch_fs *c = trans->c;
-
-       u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
-
-       cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
-                                       BTREE_ID_logged_ops,
-                                       POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
-                                       BTREE_ITER_cached);
-       int ret = bkey_err(k);
-       if (ret)
-               return ERR_PTR(ret);
-
-       struct bkey_i_inode_alloc_cursor *cursor =
-               k.k->type == KEY_TYPE_inode_alloc_cursor
-               ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
-               : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
-       ret = PTR_ERR_OR_ZERO(cursor);
-       if (ret)
-               goto err;
-
-       if (c->opts.inodes_32bit) {
-               *min = BLOCKDEV_INODE_MAX;
-               *max = INT_MAX;
-       } else {
-               cursor->v.bits = c->opts.shard_inode_numbers_bits;
-
-               unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
-
-               *min = max(cpu << bits, (u64) INT_MAX + 1);
-               *max = (cpu << bits) | ~(ULLONG_MAX << bits);
-       }
-
-       if (le64_to_cpu(cursor->v.idx)  < *min)
-               cursor->v.idx = cpu_to_le64(*min);
-
-       if (le64_to_cpu(cursor->v.idx) >= *max) {
-               cursor->v.idx = cpu_to_le64(*min);
-               le32_add_cpu(&cursor->v.gen, 1);
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret ? ERR_PTR(ret) : cursor;
-}
-
-/*
- * This just finds an empty slot:
- */
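-/*
- * Any gap between the position we ask for and the position the iterator
- * returns is a free inode number; when the cursor's range is exhausted we
- * wrap to min, bump the generation and search once more before returning
- * ENOSPC_inode_create.
- */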
-int bch2_inode_create(struct btree_trans *trans,
-                     struct btree_iter *iter,
-                     struct bch_inode_unpacked *inode_u,
-                     u32 snapshot, u64 cpu)
-{
-       u64 min, max;
-       struct bkey_i_inode_alloc_cursor *cursor =
-               bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
-       int ret = PTR_ERR_OR_ZERO(cursor);
-       if (ret)
-               return ret;
-
-       u64 start = le64_to_cpu(cursor->v.idx);
-       u64 pos = start;
-
-       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
-                            BTREE_ITER_all_snapshots|
-                            BTREE_ITER_intent);
-       struct bkey_s_c k;
-again:
-       while ((k = bch2_btree_iter_peek(trans, iter)).k &&
-              !(ret = bkey_err(k)) &&
-              bkey_lt(k.k->p, POS(0, max))) {
-               if (pos < iter->pos.offset)
-                       goto found_slot;
-
-               /*
-                * We don't need to iterate over keys in every snapshot once
-                * we've found just one:
-                */
-               pos = iter->pos.offset + 1;
-               bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
-       }
-
-       if (!ret && pos < max)
-               goto found_slot;
-
-       if (!ret && start == min)
-               ret = bch_err_throw(trans->c, ENOSPC_inode_create);
-
-       if (ret) {
-               bch2_trans_iter_exit(trans, iter);
-               return ret;
-       }
-
-       /* Retry from start */
-       pos = start = min;
-       bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
-       le32_add_cpu(&cursor->v.gen, 1);
-       goto again;
-found_slot:
-       bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
-       k = bch2_btree_iter_peek_slot(trans, iter);
-       ret = bkey_err(k);
-       if (ret) {
-               bch2_trans_iter_exit(trans, iter);
-               return ret;
-       }
-
-       inode_u->bi_inum        = k.k->p.offset;
-       inode_u->bi_generation  = le64_to_cpu(cursor->v.gen);
-       cursor->v.idx           = cpu_to_le64(k.k->p.offset + 1);
-       return 0;
-}
-
-static int bch2_inode_delete_keys(struct btree_trans *trans,
-                                 subvol_inum inum, enum btree_id id)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_i delete;
-       struct bpos end = POS(inum.inum, U64_MAX);
-       u32 snapshot;
-       int ret = 0;
-
-       /*
-        * We're never going to be deleting partial extents, no need to use an
-        * extent iterator:
-        */
-       bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
-                            BTREE_ITER_intent);
-
-       while (1) {
-               bch2_trans_begin(trans);
-
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       goto err;
-
-               bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
-               k = bch2_btree_iter_peek_max(trans, &iter, end);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               if (!k.k)
-                       break;
-
-               bkey_init(&delete.k);
-               delete.k.p = iter.pos;
-
-               if (iter.flags & BTREE_ITER_is_extents)
-                       bch2_key_resize(&delete.k,
-                                       bpos_min(end, k.k->p).offset -
-                                       iter.pos.offset);
-
-               ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
-                     bch2_trans_commit(trans, NULL, NULL,
-                                       BCH_TRANS_COMMIT_no_enospc);
-err:
-               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter = {};
-       struct bkey_s_c k;
-       u32 snapshot;
-       int ret;
-
-       ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
-       if (ret)
-               goto err2;
-
-       /*
-        * If this was a directory, there shouldn't be any real dirents left -
-        * but there could be whiteouts (from hash collisions) that we should
-        * delete:
-        *
-        * XXX: the dirent code ideally would delete whiteouts when they're no
-        * longer needed
-        */
-       ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
-               bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
-               bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
-       if (ret)
-               goto err2;
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                              SPOS(0, inum.inum, snapshot),
-                              BTREE_ITER_intent|BTREE_ITER_cached);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (!bkey_is_inode(k.k)) {
-               bch2_fs_inconsistent(c,
-                                    "inode %llu:%u not found when deleting",
-                                    inum.inum, snapshot);
-               ret = bch_err_throw(c, ENOENT_inode);
-               goto err;
-       }
-
-       ret   = bch2_btree_delete_at(trans, &iter, 0) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (ret)
-               goto err2;
-
-       ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
-err2:
-       bch2_trans_put(trans);
-       return ret;
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
-       if (bi->bi_flags & BCH_INODE_unlinked)
-               bi->bi_flags &= ~BCH_INODE_unlinked;
-       else {
-               if (bi->bi_nlink == U32_MAX)
-                       return -EINVAL;
-
-               bi->bi_nlink++;
-       }
-
-       return 0;
-}
-
-void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
-{
-       if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
-               bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
-                                       bi->bi_inum);
-               return;
-       }
-
-       if (bi->bi_flags & BCH_INODE_unlinked) {
-               bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
-               return;
-       }
-
-       if (bi->bi_nlink)
-               bi->bi_nlink--;
-       else
-               bi->bi_flags |= BCH_INODE_unlinked;
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
-{
-       struct bch_opts ret = { 0 };
-#define x(_name, _bits)                                                        \
-       if (inode->bi_##_name)                                          \
-               opt_set(ret, _name, inode->bi_##_name - 1);
-       BCH_INODE_OPTS()
-#undef x
-       return ret;
-}
-
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
-                        struct bch_inode_unpacked *inode)
-{
-#define x(_name, _bits)                                                        \
-       if ((inode)->bi_##_name) {                                      \
-               opts->_name = inode->bi_##_name - 1;                    \
-               opts->_name##_from_inode = true;                        \
-       } else {                                                        \
-               opts->_name = c->opts._name;                            \
-               opts->_name##_from_inode = false;                       \
-       }
-       BCH_INODE_OPTS()
-#undef x
-
-       bch2_io_opts_fixups(opts);
-}
-
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
-{
-       struct bch_inode_unpacked inode;
-       int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
-
-       if (ret)
-               return ret;
-
-       bch2_inode_opts_get(opts, trans->c, &inode);
-       return 0;
-}
-
-int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
-                           struct bch_inode_unpacked *bi, unsigned v)
-{
-       struct bch_fs *c = trans->c;
-
-#ifndef CONFIG_UNICODE
-       bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
-       return -EOPNOTSUPP;
-#endif
-
-       if (c->opts.casefold_disabled)
-               return -EOPNOTSUPP;
-
-       int ret = 0;
-       /* Not supported on individual files. */
-       if (!S_ISDIR(bi->bi_mode))
-               return -EOPNOTSUPP;
-
-       /*
-        * Make sure the dir is empty, as otherwise we'd need to
-        * rehash everything and update the dirent keys.
-        */
-       ret = bch2_empty_dir_trans(trans, inum);
-       if (ret < 0)
-               return ret;
-
-       ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
-       if (ret)
-               return ret;
-
-       bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-
-       bi->bi_casefold = v + 1;
-       bi->bi_fields_set |= BIT(Inode_opt_casefold);
-
-       return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
-}
-
-static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter = {};
-       struct bkey_i_inode_generation delete;
-       struct bch_inode_unpacked inode_u;
-       struct bkey_s_c k;
-       int ret;
-
-       do {
-               ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-                                                     SPOS(inum, 0, snapshot),
-                                                     SPOS(inum, U64_MAX, snapshot),
-                                                     0, NULL) ?:
-                       bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
-                                                     SPOS(inum, 0, snapshot),
-                                                     SPOS(inum, U64_MAX, snapshot),
-                                                     0, NULL) ?:
-                       bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
-                                                     SPOS(inum, 0, snapshot),
-                                                     SPOS(inum, U64_MAX, snapshot),
-                                                     0, NULL);
-       } while (ret == -BCH_ERR_transaction_restart_nested);
-       if (ret)
-               goto err;
-retry:
-       bch2_trans_begin(trans);
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                              SPOS(0, inum, snapshot), BTREE_ITER_intent);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (!bkey_is_inode(k.k)) {
-               bch2_fs_inconsistent(c,
-                                    "inode %llu:%u not found when deleting",
-                                    inum, snapshot);
-               ret = bch_err_throw(c, ENOENT_inode);
-               goto err;
-       }
-
-       bch2_inode_unpack(k, &inode_u);
-
-       /* Subvolume root? */
-       if (inode_u.bi_subvol)
-               bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
-
-       bkey_inode_generation_init(&delete.k_i);
-       delete.k.p = iter.pos;
-       delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
-       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       return ret ?: -BCH_ERR_transaction_restart_nested;
-}
-
-/*
- * After deleting an inode, there may be versions in older snapshots that should
- * also be deleted - if they're not referenced by sibling snapshots and not open
- * in other subvolumes:
- */
-static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-next_parent:
-       ret = lockrestart_do(trans,
-               bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
-       if (ret || !k.k)
-               return ret;
-
-       bool unlinked = bkey_is_unlinked_inode(k);
-       pos = k.k->p;
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (!unlinked)
-               return 0;
-
-       ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
-       if (ret)
-               return ret < 0 ? ret : 0;
-
-       ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
-       if (ret)
-               return ret;
-       goto next_parent;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
-       return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
-               delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
-}
-
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
-                                   bool from_deleted_inodes)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter inode_iter;
-       struct bkey_s_c k;
-       struct bch_inode_unpacked inode;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
-       if (fsck_err_on(from_deleted_inodes && ret,
-                       trans, deleted_inode_missing,
-                       "nonexistent inode %llu:%u in deleted_inodes btree",
-                       pos.offset, pos.snapshot))
-               goto delete;
-       if (ret)
-               goto out;
-
-       ret = bch2_inode_unpack(k, &inode);
-       if (ret)
-               goto out;
-
-       if (S_ISDIR(inode.bi_mode)) {
-               ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
-               if (fsck_err_on(from_deleted_inodes &&
-                               bch2_err_matches(ret, ENOTEMPTY),
-                               trans, deleted_inode_is_dir,
-                               "non-empty directory %llu:%u in deleted_inodes btree",
-                               pos.offset, pos.snapshot))
-                       goto delete;
-               if (ret)
-                       goto out;
-       }
-
-       ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
-       if (fsck_err_on(from_deleted_inodes && ret,
-                       trans, deleted_inode_not_unlinked,
-                       "non-deleted inode %llu:%u in deleted_inodes btree",
-                       pos.offset, pos.snapshot))
-               goto delete;
-       if (ret)
-               goto out;
-
-       ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
-               ? 0 : bch_err_throw(c, inode_has_child_snapshot);
-
-       if (fsck_err_on(from_deleted_inodes && ret,
-                       trans, deleted_inode_has_child_snapshots,
-                       "inode with child snapshots %llu:%u in deleted_inodes btree",
-                       pos.offset, pos.snapshot))
-               goto delete;
-       if (ret)
-               goto out;
-
-       ret = bch2_inode_has_child_snapshots(trans, k.k->p);
-       if (ret < 0)
-               goto out;
-
-       if (ret) {
-               if (fsck_err(trans, inode_has_child_snapshots_wrong,
-                            "inode has_child_snapshots flag wrong (should be set)\n%s",
-                            (printbuf_reset(&buf),
-                             bch2_inode_unpacked_to_text(&buf, &inode),
-                             buf.buf))) {
-                       inode.bi_flags |= BCH_INODE_has_child_snapshot;
-                       ret = __bch2_fsck_write_inode(trans, &inode);
-                       if (ret)
-                               goto out;
-               }
-
-               if (!from_deleted_inodes) {
-                       ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-                               bch_err_throw(c, inode_has_child_snapshot);
-                       goto out;
-               }
-
-               goto delete;
-       }
-
-       if (from_deleted_inodes) {
-               if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
-                   !fsck_err(trans, deleted_inode_but_clean,
-                             "filesystem marked as clean but have deleted inode %llu:%u",
-                             pos.offset, pos.snapshot)) {
-                       ret = 0;
-                       goto out;
-               }
-
-               ret = 1;
-       }
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-       printbuf_exit(&buf);
-       return ret;
-delete:
-       ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
-       goto out;
-}
-
-static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
-{
-       u32 snapshot;
-
-       return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
-               may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret;
-
-       /*
-        * if we ran check_inodes(), unlinked inodes will have already been
-        * cleaned up, but the write buffer will be out of sync; therefore we
-        * always need a write buffer flush
-        */
-       ret = bch2_btree_write_buffer_flush_sync(trans);
-       if (ret)
-               goto err;
-
-       /*
-        * Weird transaction restart handling here because on successful delete,
-        * bch2_inode_rm_snapshot() will return a nested transaction restart,
-        * but we can't retry because the btree write buffer won't have been
-        * flushed and we'd spin:
-        */
-       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
-                                       BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-               ret = may_delete_deleted_inode(trans, k.k->p, true);
-               if (ret > 0) {
-                       bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
-                                               k.k->p.offset, k.k->p.snapshot);
-
-                       ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
-                       /*
-                        * We don't want to loop here: a transaction restart
-                        * error here means we handled a transaction restart and
-                        * we're actually done, but if we loop we'll retry the
-                        * same key because the write buffer hasn't been flushed
-                        * yet
-                        */
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                               ret = 0;
-                               continue;
-                       }
-               }
-
-               ret;
-       }));
-err:
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
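
The option accessors just removed (bch2_inode_opts_to_opts(), bch2_inode_opts_get()) rely on per-inode options being stored with a +1 bias: 0 means "unset, inherit the filesystem-wide option", anything else is the value plus one. A minimal standalone sketch of that convention (illustrative only, not bcachefs code; names here are made up):

    #include <stdio.h>

    struct fs_opts    { unsigned compression; };
    struct inode_opts { unsigned bi_compression; };	/* 0 = unset, else value + 1 */

    static unsigned opt_get(const struct fs_opts *fs, const struct inode_opts *bi)
    {
    	/* biased by one: zero falls back to the filesystem-wide option */
    	return bi->bi_compression ? bi->bi_compression - 1 : fs->compression;
    }

    int main(void)
    {
    	struct fs_opts fs = { .compression = 2 };
    	struct inode_opts unset = { 0 };
    	struct inode_opts set   = { .bi_compression = 4 + 1 };

    	printf("%u %u\n", opt_get(&fs, &unset), opt_get(&fs, &set));	/* "2 4" */
    	return 0;
    }

The same bias appears below in inode.h's inode_opt_get() macro and in bch2_inode_casefold().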
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644 (file)
index b8ec3e6..0000000
+++ /dev/null
@@ -1,319 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-#include "snapshot.h"
-
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
-                       struct bkey_validate_context);
-int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
-
-static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
-       return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
-               ? __bch2_inode_has_child_snapshots(trans, pos)
-               : 0;
-}
-
-int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
-                      struct bkey_s_c, struct bkey_s,
-                      enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_inode ((struct bkey_ops) {       \
-       .key_validate   = bch2_inode_validate,          \
-       .val_to_text    = bch2_inode_to_text,           \
-       .trigger        = bch2_trigger_inode,           \
-       .min_val_size   = 16,                           \
-})
-
-#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) {    \
-       .key_validate   = bch2_inode_v2_validate,       \
-       .val_to_text    = bch2_inode_to_text,           \
-       .trigger        = bch2_trigger_inode,           \
-       .min_val_size   = 32,                           \
-})
-
-#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) {    \
-       .key_validate   = bch2_inode_v3_validate,       \
-       .val_to_text    = bch2_inode_to_text,           \
-       .trigger        = bch2_trigger_inode,           \
-       .min_val_size   = 48,                           \
-})
-
-static inline bool bkey_is_inode(const struct bkey *k)
-{
-       return  k->type == KEY_TYPE_inode ||
-               k->type == KEY_TYPE_inode_v2 ||
-               k->type == KEY_TYPE_inode_v3;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
-                                  struct bkey_validate_context);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_generation ((struct bkey_ops) {    \
-       .key_validate   = bch2_inode_generation_validate,       \
-       .val_to_text    = bch2_inode_generation_to_text,        \
-       .min_val_size   = 8,                                    \
-})
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
-                                    struct bkey_validate_context);
-void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) {  \
-       .key_validate   = bch2_inode_alloc_cursor_validate,     \
-       .val_to_text    = bch2_inode_alloc_cursor_to_text,      \
-       .min_val_size   = 16,                                   \
-})
-
-#if 0
-typedef struct {
-       u64                     lo;
-       u32                     hi;
-} __packed __aligned(4) u96;
-#endif
-typedef u64 u96;
-
-struct bch_inode_unpacked {
-       u64                     bi_inum;
-       u32                     bi_snapshot;
-       u64                     bi_journal_seq;
-       __le64                  bi_hash_seed;
-       u64                     bi_size;
-       u64                     bi_sectors;
-       u64                     bi_version;
-       u32                     bi_flags;
-       u16                     bi_mode;
-
-#define x(_name, _bits)        u##_bits _name;
-       BCH_INODE_FIELDS_v3()
-#undef  x
-};
-BITMASK(INODE_STR_HASH,        struct bch_inode_unpacked, bi_flags, 20, 24);
-
-struct bkey_inode_buf {
-       struct bkey_i_inode_v3  inode;
-
-#define x(_name, _bits)                + 8 + _bits / 8
-       u8              _pad[0 + BCH_INODE_FIELDS_v3()];
-#undef  x
-};
-
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
-
-void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-
-int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
-                     struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
-
-static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
-                                        struct btree_iter *iter,
-                                        struct bch_inode_unpacked *inode,
-                                        subvol_inum inum, unsigned flags)
-{
-       return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
-}
-
-static inline int bch2_inode_peek(struct btree_trans *trans,
-                                 struct btree_iter *iter,
-                                 struct bch_inode_unpacked *inode,
-                                 subvol_inum inum, unsigned flags)
-{
-       return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
-                                    struct bch_inode_unpacked *, unsigned);
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
-                                 subvol_inum,
-                                 struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
-                                 struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
-                           struct bch_inode_unpacked *);
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
-                                 struct bch_inode_unpacked *root);
-
-int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
-                    struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
-
-static inline int bch2_inode_write(struct btree_trans *trans,
-                    struct btree_iter *iter,
-                    struct bch_inode_unpacked *inode)
-{
-       return bch2_inode_write_flags(trans, iter, inode, 0);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-
-void bch2_inode_init_early(struct bch_fs *,
-                          struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
-                         uid_t, gid_t, umode_t, dev_t,
-                         struct bch_inode_unpacked *);
-void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
-                    uid_t, gid_t, umode_t, dev_t,
-                    struct bch_inode_unpacked *);
-
-int bch2_inode_create(struct btree_trans *, struct btree_iter *,
-                     struct bch_inode_unpacked *, u32, u64);
-
-int bch2_inode_rm(struct bch_fs *, subvol_inum);
-
-#define inode_opt_get(_c, _inode, _name)                       \
-       ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
-
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
-                                     enum inode_opt_id id, u64 v)
-{
-       switch (id) {
-#define x(_name, ...)                                                  \
-       case Inode_opt_##_name:                                         \
-               inode->bi_##_name = v;                                  \
-               break;
-       BCH_INODE_OPTS()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
-                                    enum inode_opt_id id)
-{
-       switch (id) {
-#define x(_name, ...)                                                  \
-       case Inode_opt_##_name:                                         \
-               return inode->bi_##_name;
-       BCH_INODE_OPTS()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-static inline u8 mode_to_type(umode_t mode)
-{
-       return (mode >> 12) & 15;
-}
-
-static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
-{
-       return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
-}
-
-static inline u32 bch2_inode_flags(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_inode:
-               return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
-       case KEY_TYPE_inode_v2:
-               return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
-       case KEY_TYPE_inode_v3:
-               return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
-       default:
-               return 0;
-       }
-}
-
-static inline unsigned bkey_inode_mode(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_inode:
-               return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
-       case KEY_TYPE_inode_v2:
-               return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
-       case KEY_TYPE_inode_v3:
-               return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
-       default:
-               return 0;
-       }
-}
-
-static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
-       /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
-       return bi->bi_casefold
-               ? bi->bi_casefold - 1
-               : c->opts.casefold;
-}
-
-static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi)
-{
-       return bi->bi_dir || bi->bi_dir_offset;
-}
-
-/* i_nlink: */
-
-static inline unsigned nlink_bias(umode_t mode)
-{
-       return S_ISDIR(mode) ? 2 : 1;
-}
-
-static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
-{
-       return bi->bi_flags & BCH_INODE_unlinked
-                 ? 0
-                 : bi->bi_nlink + nlink_bias(bi->bi_mode);
-}
-
-static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
-                                       unsigned nlink)
-{
-       if (nlink) {
-               bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
-               bi->bi_flags &= ~BCH_INODE_unlinked;
-       } else {
-               bi->bi_nlink = 0;
-               bi->bi_flags |= BCH_INODE_unlinked;
-       }
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
-void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
-                        struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
-int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
-                           struct bch_inode_unpacked *, unsigned);
-
-#include "rebalance.h"
-
-static inline struct bch_extent_rebalance
-bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
-{
-       struct bch_io_opts io_opts;
-       bch2_inode_opts_get(&io_opts, c, inode);
-       return io_opts_to_rebalance_opts(c, &io_opts);
-}
-
-#define BCACHEFS_ROOT_SUBVOL_INUM                                      \
-       ((subvol_inum) { BCACHEFS_ROOT_SUBVOL,  BCACHEFS_ROOT_INO })
-
-static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
-       return a.subvol == b.subvol && a.inum == b.inum;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
-int bch2_delete_dead_inodes(struct bch_fs *);
-
-#endif /* _BCACHEFS_INODE_H */
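
nlink_bias(), bch2_inode_nlink_get() and bch2_inode_nlink_set() above encode the VFS link count relative to a per-type bias (2 for directories, which count "." plus their dirent in the parent; 1 otherwise), with the BCH_INODE_unlinked flag standing in for a count of zero. A standalone sketch of the assumed round trip (illustrative, not bcachefs code):

    #include <assert.h>
    #include <stdbool.h>

    struct ino {
    	unsigned	nlink;		/* on-disk field: link count minus the bias */
    	bool		is_dir;
    	bool		unlinked;	/* stands in for the BCH_INODE_unlinked flag */
    };

    /* directories carry an implicit bias of 2 ("." plus their own dirent) */
    static unsigned bias(const struct ino *i)
    {
    	return i->is_dir ? 2 : 1;
    }

    static unsigned nlink_get(const struct ino *i)
    {
    	return i->unlinked ? 0 : i->nlink + bias(i);
    }

    static void nlink_set(struct ino *i, unsigned nlink)
    {
    	i->unlinked = !nlink;
    	i->nlink    = nlink ? nlink - bias(i) : 0;
    }

    int main(void)
    {
    	struct ino dir = { .is_dir = true };

    	nlink_set(&dir, 2);		/* a fresh, empty directory */
    	assert(dir.nlink == 0 && nlink_get(&dir) == 2);

    	nlink_set(&dir, 0);		/* fully unlinked */
    	assert(dir.unlinked && nlink_get(&dir) == 0);
    	return 0;
    }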
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
deleted file mode 100644 (file)
index 1f00938..0000000
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_FORMAT_H
-#define _BCACHEFS_INODE_FORMAT_H
-
-#define BLOCKDEV_INODE_MAX     4096
-#define BCACHEFS_ROOT_INO      4096
-
-struct bch_inode {
-       struct bch_val          v;
-
-       __le64                  bi_hash_seed;
-       __le32                  bi_flags;
-       __le16                  bi_mode;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
-       struct bch_val          v;
-
-       __le64                  bi_journal_seq;
-       __le64                  bi_hash_seed;
-       __le64                  bi_flags;
-       __le16                  bi_mode;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
-       struct bch_val          v;
-
-       __le64                  bi_journal_seq;
-       __le64                  bi_hash_seed;
-       __le64                  bi_flags;
-       __le64                  bi_sectors;
-       __le64                  bi_size;
-       __le64                  bi_version;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL   6
-#define INODEv3_FIELDS_START_CUR       (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
-       struct bch_val          v;
-
-       __le32                  bi_generation;
-       __le32                  pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2()                  \
-       x(bi_atime,                     96)     \
-       x(bi_ctime,                     96)     \
-       x(bi_mtime,                     96)     \
-       x(bi_otime,                     96)     \
-       x(bi_size,                      64)     \
-       x(bi_sectors,                   64)     \
-       x(bi_uid,                       32)     \
-       x(bi_gid,                       32)     \
-       x(bi_nlink,                     32)     \
-       x(bi_generation,                32)     \
-       x(bi_dev,                       32)     \
-       x(bi_data_checksum,             8)      \
-       x(bi_compression,               8)      \
-       x(bi_project,                   32)     \
-       x(bi_background_compression,    8)      \
-       x(bi_data_replicas,             8)      \
-       x(bi_promote_target,            16)     \
-       x(bi_foreground_target,         16)     \
-       x(bi_background_target,         16)     \
-       x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)     \
-       x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)     \
-       x(bi_subvol,                    32)     \
-       x(bi_parent_subvol,             32)
-
-#define BCH_INODE_FIELDS_v3()                  \
-       x(bi_atime,                     96)     \
-       x(bi_ctime,                     96)     \
-       x(bi_mtime,                     96)     \
-       x(bi_otime,                     96)     \
-       x(bi_uid,                       32)     \
-       x(bi_gid,                       32)     \
-       x(bi_nlink,                     32)     \
-       x(bi_generation,                32)     \
-       x(bi_dev,                       32)     \
-       x(bi_data_checksum,             8)      \
-       x(bi_compression,               8)      \
-       x(bi_project,                   32)     \
-       x(bi_background_compression,    8)      \
-       x(bi_data_replicas,             8)      \
-       x(bi_promote_target,            16)     \
-       x(bi_foreground_target,         16)     \
-       x(bi_background_target,         16)     \
-       x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)     \
-       x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)     \
-       x(bi_subvol,                    32)     \
-       x(bi_parent_subvol,             32)     \
-       x(bi_nocow,                     8)      \
-       x(bi_depth,                     32)     \
-       x(bi_inodes_32bit,              8)      \
-       x(bi_casefold,                  8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS()                       \
-       x(data_checksum,                8)      \
-       x(compression,                  8)      \
-       x(project,                      32)     \
-       x(background_compression,       8)      \
-       x(data_replicas,                8)      \
-       x(promote_target,               16)     \
-       x(foreground_target,            16)     \
-       x(background_target,            16)     \
-       x(erasure_code,                 16)     \
-       x(nocow,                        8)      \
-       x(inodes_32bit,                 8)      \
-       x(casefold,                     8)
-
-enum inode_opt_id {
-#define x(name, ...)                           \
-       Inode_opt_##name,
-       BCH_INODE_OPTS()
-#undef  x
-       Inode_opt_nr,
-};
-
-/*
- * BCH_INODE_has_case_insensitive is set if any descendant is case insensitive -
- * for overlayfs
- */
-#define BCH_INODE_FLAGS()                      \
-       x(sync,                         0)      \
-       x(immutable,                    1)      \
-       x(append,                       2)      \
-       x(nodump,                       3)      \
-       x(noatime,                      4)      \
-       x(i_size_dirty,                 5)      \
-       x(i_sectors_dirty,              6)      \
-       x(unlinked,                     7)      \
-       x(backptr_untrusted,            8)      \
-       x(has_child_snapshot,           9)      \
-       x(has_case_insensitive,         10)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n)        BCH_INODE_##t = 1U << n,
-       BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n)        __BCH_INODE_##t = n,
-       BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODEv1_NR_FIELDS,        struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS,        struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
-                               struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE,     struct bch_inode_v3, bi_flags, 36, 52);
-
-struct bch_inode_alloc_cursor {
-       struct bch_val          v;
-       __u8                    bits;
-       __u8                    pad;
-       __le32                  gen;
-       __le64                  idx;
-};
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
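
BCH_INODE_FIELDS_v2/v3() and BCH_INODE_OPTS() above are x-macros: a single field list that each consumer expands differently by redefining x() around it, which is how struct bch_inode_unpacked and enum inode_opt_id stay in sync with the field list. A reduced sketch of the pattern (field names here are placeholders):

    #include <stdio.h>

    typedef unsigned int u32;

    /* one authoritative field list, expanded several ways below */
    #define FIELDS()		\
    	x(uid, 32)		\
    	x(gid, 32)		\
    	x(dev, 32)

    /* expansion 1: struct members, typed from the width argument */
    struct unpacked {
    #define x(_name, _bits)	u##_bits _name;
    	FIELDS()
    #undef x
    };

    /* expansion 2: a matching enum of field ids, plus a count */
    enum field_id {
    #define x(_name, _bits)	field_##_name,
    	FIELDS()
    #undef x
    	field_nr,
    };

    int main(void)
    {
    	struct unpacked u = { .uid = 1000 };

    	printf("fields: %d, uid: %u\n", (int) field_nr, u.uid);	/* fields: 3, uid: 1000 */
    	return 0;
    }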
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644 (file)
index 0702366..0000000
+++ /dev/null
@@ -1,570 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-                         subvol_inum inum,
-                         struct btree_iter *iter,
-                         u64 sectors,
-                         struct bch_io_opts opts,
-                         s64 *i_sectors_delta,
-                         struct write_point_specifier write_point)
-{
-       struct bch_fs *c = trans->c;
-       struct disk_reservation disk_res = { 0 };
-       struct closure cl;
-       struct open_buckets open_buckets = { 0 };
-       struct bkey_s_c k;
-       struct bkey_buf old, new;
-       unsigned sectors_allocated = 0, new_replicas;
-       bool unwritten = opts.nocow &&
-           c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-       int ret;
-
-       bch2_bkey_buf_init(&old);
-       bch2_bkey_buf_init(&new);
-       closure_init_stack(&cl);
-
-       k = bch2_btree_iter_peek_slot(trans, iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-       new_replicas = max(0, (int) opts.data_replicas -
-                          (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
-       /*
-        * Get a disk reservation before calling into the allocator
-        * (which we only do in the nocow case):
-        */
-       ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-       if (unlikely(ret))
-               goto err_noprint;
-
-       bch2_bkey_buf_reassemble(&old, c, k);
-
-       if (!unwritten) {
-               struct bkey_i_reservation *reservation;
-
-               bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-               reservation = bkey_reservation_init(new.k);
-               reservation->k.p = iter->pos;
-               bch2_key_resize(&reservation->k, sectors);
-               reservation->v.nr_replicas = opts.data_replicas;
-       } else {
-               struct bkey_i_extent *e;
-               struct bch_devs_list devs_have;
-               struct write_point *wp;
-
-               devs_have.nr = 0;
-
-               bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-               e = bkey_extent_init(new.k);
-               e->k.p = iter->pos;
-
-               ret = bch2_alloc_sectors_start_trans(trans,
-                               opts.foreground_target,
-                               false,
-                               write_point,
-                               &devs_have,
-                               opts.data_replicas,
-                               opts.data_replicas,
-                               BCH_WATERMARK_normal, 0, &cl, &wp);
-               if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                       ret = bch_err_throw(c, transaction_restart_nested);
-               if (ret)
-                       goto err;
-
-               sectors = min_t(u64, sectors, wp->sectors_free);
-               sectors_allocated = sectors;
-
-               bch2_key_resize(&e->k, sectors);
-
-               bch2_open_bucket_get(c, wp, &open_buckets);
-               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-               bch2_alloc_sectors_done(c, wp);
-
-               extent_for_each_ptr(extent_i_to_s(e), ptr)
-                       ptr->unwritten = true;
-       }
-
-       ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-                                0, i_sectors_delta, true);
-err:
-       if (!ret && sectors_allocated)
-               bch2_increment_clock(c, sectors_allocated, WRITE);
-       if (should_print_err(ret)) {
-               struct printbuf buf = PRINTBUF;
-               lockrestart_do(trans,
-                       bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
-               prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
-               bch_err_ratelimited(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-err_noprint:
-       bch2_open_buckets_put(c, &open_buckets);
-       bch2_disk_reservation_put(c, &disk_res);
-       bch2_bkey_buf_exit(&new, c);
-       bch2_bkey_buf_exit(&old, c);
-
-       if (closure_nr_remaining(&cl) != 1) {
-               bch2_trans_unlock_long(trans);
-               bch2_wait_on_allocator(c, &cl);
-       }
-
-       return ret;
-}
-
-/* For fsck */
-int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
-{
-       u32 restart_count = trans->restart_count;
-       struct bch_fs *c = trans->c;
-       struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
-       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
-       struct bkey_i delete;
-
-       int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
-                       start, end, 0, k,
-                       &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-               bkey_init(&delete.k);
-               delete.k.p = iter.pos;
-
-               /* create the biggest key we can */
-               bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end, &delete);
-
-               bch2_extent_trim_atomic(trans, &iter, &delete) ?:
-               bch2_trans_update(trans, &iter, &delete, 0);
-       }));
-
-       bch2_disk_reservation_put(c, &disk_res);
-       return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                  subvol_inum inum, u64 end,
-                  s64 *i_sectors_delta)
-{
-       struct bch_fs *c        = trans->c;
-       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
-       struct bpos end_pos = POS(inum.inum, end);
-       struct bkey_s_c k;
-       int ret = 0, ret2 = 0;
-       u32 snapshot;
-
-       while (!ret ||
-              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
-
-               if (ret)
-                       ret2 = ret;
-
-               bch2_trans_begin(trans);
-
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-
-               /*
-                * peek_max() doesn't have ideal semantics for extents:
-                */
-               k = bch2_btree_iter_peek_max(trans, iter, end_pos);
-               if (!k.k)
-                       break;
-
-               ret = bkey_err(k);
-               if (ret)
-                       continue;
-
-               bkey_init(&delete.k);
-               delete.k.p = iter->pos;
-
-               /* create the biggest key we can */
-               bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end_pos, &delete);
-
-               ret = bch2_extent_update(trans, inum, iter, &delete,
-                               &disk_res, 0, i_sectors_delta, false);
-               bch2_disk_reservation_put(c, &disk_res);
-       }
-
-       return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
-               s64 *i_sectors_delta)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, start),
-                            BTREE_ITER_intent);
-
-       ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               ret = 0;
-
-       return ret;
-}
-
-/* truncate: */
-
-void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
-
-       prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
-       prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
-       prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
-}
-
-static int truncate_set_isize(struct btree_trans *trans,
-                             subvol_inum inum,
-                             u64 new_i_size,
-                             bool warn)
-{
-       struct btree_iter iter = {};
-       struct bch_inode_unpacked inode_u;
-       int ret;
-
-       ret   = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
-               (inode_u.bi_size = new_i_size, 0) ?:
-               bch2_inode_write(trans, &iter, &inode_u);
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
-                                           struct bkey_i *op_k,
-                                           u64 *i_sectors_delta)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter fpunch_iter;
-       struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
-       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
-       u64 new_i_size = le64_to_cpu(op->v.new_i_size);
-       bool warn_errors = i_sectors_delta != NULL;
-       int ret;
-
-       ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
-                            POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
-                            BTREE_ITER_intent);
-       ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
-       bch2_trans_iter_exit(trans, &fpunch_iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               ret = 0;
-err:
-       if (warn_errors)
-               bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
-{
-       return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
-}
-
-int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
-{
-       struct bkey_i_logged_op_truncate op;
-
-       bkey_logged_op_truncate_init(&op.k_i);
-       op.v.subvol     = cpu_to_le32(inum.subvol);
-       op.v.inum       = cpu_to_le64(inum.inum);
-       op.v.new_i_size = cpu_to_le64(new_i_size);
-
-       /*
-        * Logged ops aren't atomic w.r.t. snapshot creation: creating a
-        * snapshot while they're in progress, then crashing, will result in the
-        * resume only proceeding in one of the snapshots
-        */
-       down_read(&c->snapshot_create_lock);
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = bch2_logged_op_start(trans, &op.k_i);
-       if (ret)
-               goto out;
-       ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
-       ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
-       bch2_trans_put(trans);
-       up_read(&c->snapshot_create_lock);
-
-       return ret;
-}
-
-/* finsert/fcollapse: */
-
-void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
-
-       prt_printf(out, "subvol=%u",            le32_to_cpu(op.v->subvol));
-       prt_printf(out, " inum=%llu",           le64_to_cpu(op.v->inum));
-       prt_printf(out, " dst_offset=%lli",     le64_to_cpu(op.v->dst_offset));
-       prt_printf(out, " src_offset=%llu",     le64_to_cpu(op.v->src_offset));
-}
-
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
-                        u64 offset, s64 len, bool warn)
-{
-       struct btree_iter iter;
-       struct bch_inode_unpacked inode_u;
-       int ret;
-
-       offset  <<= 9;
-       len     <<= 9;
-
-       ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
-       if (ret)
-               return ret;
-
-       if (len > 0) {
-               if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
-                       ret = -EFBIG;
-                       goto err;
-               }
-
-               if (offset >= inode_u.bi_size) {
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-
-       inode_u.bi_size += len;
-       inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
-
-       ret = bch2_inode_write(trans, &iter, &inode_u);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
-                                          struct bkey_i *op_k,
-                                          u64 *i_sectors_delta)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
-       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
-       struct bch_io_opts opts;
-       u64 dst_offset = le64_to_cpu(op->v.dst_offset);
-       u64 src_offset = le64_to_cpu(op->v.src_offset);
-       s64 shift = dst_offset - src_offset;
-       u64 len = abs(shift);
-       u64 pos = le64_to_cpu(op->v.pos);
-       bool insert = shift > 0;
-       u32 snapshot;
-       bool warn_errors = i_sectors_delta != NULL;
-       int ret = 0;
-
-       ret = bch2_inum_opts_get(trans, inum, &opts);
-       if (ret)
-               return ret;
-
-       /*
-        * check for a missing subvolume before fpunch: during resume we don't
-        * want it to be a fatal error
-        */
-       ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
-       if (ret)
-               return ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, 0),
-                            BTREE_ITER_intent);
-
-       switch (op->v.state) {
-case LOGGED_OP_FINSERT_start:
-       op->v.state = LOGGED_OP_FINSERT_shift_extents;
-
-       if (insert) {
-               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                               adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
-                               bch2_logged_op_update(trans, &op->k_i));
-               if (ret)
-                       goto err;
-       } else {
-               bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
-
-               ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
-               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto err;
-
-               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                               bch2_logged_op_update(trans, &op->k_i));
-       }
-
-       fallthrough;
-case LOGGED_OP_FINSERT_shift_extents:
-       while (1) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete, *copy;
-               struct bkey_s_c k;
-               struct bpos src_pos = POS(inum.inum, src_offset);
-
-               bch2_trans_begin(trans);
-
-               ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
-                                                   warn_errors);
-               if (ret)
-                       goto btree_err;
-
-               bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-               bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
-
-               k = insert
-                       ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
-                       : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       goto btree_err;
-
-               if (!k.k ||
-                   k.k->p.inode != inum.inum ||
-                   bkey_le(k.k->p, POS(inum.inum, src_offset)))
-                       break;
-
-               copy = bch2_bkey_make_mut_noupdate(trans, k);
-               if ((ret = PTR_ERR_OR_ZERO(copy)))
-                       goto btree_err;
-
-               if (insert &&
-                   bkey_lt(bkey_start_pos(k.k), src_pos)) {
-                       bch2_cut_front(src_pos, copy);
-
-                       /* Splitting compressed extent? */
-                       bch2_disk_reservation_add(c, &disk_res,
-                                       copy->k.size *
-                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
-                                       BCH_DISK_RESERVATION_NOFAIL);
-               }
-
-               bkey_init(&delete.k);
-               delete.k.p = copy->k.p;
-               delete.k.p.snapshot = snapshot;
-               delete.k.size = copy->k.size;
-
-               copy->k.p.offset += shift;
-               copy->k.p.snapshot = snapshot;
-
-               op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
-
-               ret =   bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
-                       bch2_logged_op_update(trans, &op->k_i) ?:
-                       bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
-btree_err:
-               bch2_disk_reservation_put(c, &disk_res);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       goto err;
-
-               pos = le64_to_cpu(op->v.pos);
-       }
-
-       op->v.state = LOGGED_OP_FINSERT_finish;
-
-       if (!insert) {
-               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                               adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
-                               bch2_logged_op_update(trans, &op->k_i));
-       } else {
-               /* We need an inode update to update bi_journal_seq for fsync: */
-               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                               adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
-                               bch2_logged_op_update(trans, &op->k_i));
-       }
-
-       break;
-case LOGGED_OP_FINSERT_finish:
-       break;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       if (warn_errors)
-               bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
-{
-       return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
-}
-
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
-                          u64 offset, u64 len, bool insert,
-                          s64 *i_sectors_delta)
-{
-       struct bkey_i_logged_op_finsert op;
-       s64 shift = insert ? len : -len;
-
-       bkey_logged_op_finsert_init(&op.k_i);
-       op.v.subvol     = cpu_to_le32(inum.subvol);
-       op.v.inum       = cpu_to_le64(inum.inum);
-       op.v.dst_offset = cpu_to_le64(offset + shift);
-       op.v.src_offset = cpu_to_le64(offset);
-       op.v.pos        = cpu_to_le64(insert ? U64_MAX : offset);
-
-       /*
-        * Logged ops aren't atomic w.r.t. snapshot creation: creating a
-        * snapshot while they're in progress, then crashing, will result in the
-        * resume only proceeding in one of the snapshots
-        */
-       down_read(&c->snapshot_create_lock);
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = bch2_logged_op_start(trans, &op.k_i);
-       if (ret)
-               goto out;
-       ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
-       ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
-       bch2_trans_put(trans);
-       up_read(&c->snapshot_create_lock);
-
-       return ret;
-}
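
Both bch2_truncate() and bch2_fcollapse_finsert() above follow the same logged-op discipline: persist the operation before doing work (bch2_logged_op_start()), commit each state transition and progress cursor (bch2_logged_op_update()), and delete the record when done (bch2_logged_op_finish()), so recovery can re-enter the same state machine after a crash. A conceptual, self-contained sketch of that resume pattern (not the real API):

    #include <stdio.h>

    enum op_state { OP_START, OP_SHIFT_EXTENTS, OP_FINISH };

    struct logged_op {
    	enum op_state		state;
    	unsigned long long	pos;	/* progress cursor, persisted with the op */
    };

    /* stand-in for a transaction commit that persists the op record */
    static void commit(const struct logged_op *op)
    {
    	printf("committed: state=%d pos=%llu\n", op->state, op->pos);
    }

    static void run_or_resume(struct logged_op *op)
    {
    	switch (op->state) {
    	case OP_START:
    		op->state = OP_SHIFT_EXTENTS;
    		commit(op);
    		/* fallthrough */
    	case OP_SHIFT_EXTENTS:
    		while (op->pos < 3) {	/* each iteration commits atomically */
    			op->pos++;
    			commit(op);
    		}
    		op->state = OP_FINISH;
    		commit(op);
    		/* fallthrough */
    	case OP_FINISH:
    		break;	/* the persistent op record would be deleted here */
    	}
    }

    int main(void)
    {
    	/* pretend we crashed mid-operation and are resuming from the journal */
    	struct logged_op op = { OP_SHIFT_EXTENTS, 1 };

    	run_or_resume(&op);
    	return 0;
    }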
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
deleted file mode 100644 (file)
index b93e4d4..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_MISC_H
-#define _BCACHEFS_IO_MISC_H
-
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
-                         u64, struct bch_io_opts, s64 *,
-                         struct write_point_specifier);
-
-int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-                  subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {  \
-       .val_to_text    = bch2_logged_op_truncate_to_text,      \
-       .min_val_size   = 24,                                   \
-})
-
-int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
-
-int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
-
-void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {   \
-       .val_to_text    = bch2_logged_op_finsert_to_text,       \
-       .min_val_size   = 24,                                   \
-})
-
-int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
-
-int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
-
-#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
deleted file mode 100644 (file)
index 460e2e6..0000000
+++ /dev/null
@@ -1,1543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "Simulate corruption on 1/n reads, for testing checksum error paths");
-#endif
-
-static bool bch2_poison_extents_on_checksum_error;
-module_param_named(poison_extents_on_checksum_error,
-                  bch2_poison_extents_on_checksum_error, bool, 0644);
-MODULE_PARM_DESC(poison_extents_on_checksum_error,
-                "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       const struct bch_devs_mask *devs;
-       unsigned d, nr = 0, total = 0;
-       u64 now = local_clock(), last;
-       s64 congested;
-       struct bch_dev *ca;
-
-       if (!target)
-               return false;
-
-       guard(rcu)();
-       devs = bch2_target_to_mask(c, target) ?:
-               &c->rw_devs[BCH_DATA_user];
-
-       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
-               ca = rcu_dereference(c->devs[d]);
-               if (!ca)
-                       continue;
-
-               congested = atomic_read(&ca->congested);
-               last = READ_ONCE(ca->congested_last);
-               if (time_after64(now, last))
-                       congested -= (now - last) >> 12;
-
-               total += max(congested, 0LL);
-               nr++;
-       }
-
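-       /*
-        * Probabilistic congestion check: the higher the (decayed) congestion
-        * totals across the target's devices, the more likely we report the
-        * target as congested so the caller can read elsewhere:
-        */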
-       return get_random_u32_below(nr * CONGESTED_MAX) < total;
-}
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       return false;
-}
-
-#endif
-
-/* Cache promotion on read */
-
-static const struct rhashtable_params bch_promote_params = {
-       .head_offset            = offsetof(struct promote_op, hash),
-       .key_offset             = offsetof(struct promote_op, pos),
-       .key_len                = sizeof(struct bpos),
-       .automatic_shrinking    = true,
-};
-
-static inline bool have_io_error(struct bch_io_failures *failed)
-{
-       return failed && failed->nr;
-}
-
-static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
-{
-       EBUG_ON(rbio->split);
-
-       return rbio->data_update
-               ? container_of(rbio, struct data_update, rbio)
-               : NULL;
-}
-
-static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
-{
-       struct data_update *u = rbio_data_update(orig);
-       if (!u)
-               return false;
-
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
-       unsigned i = 0;
-       bkey_for_each_ptr(ptrs, ptr) {
-               if (ptr->dev == dev &&
-                   u->data_opts.rewrite_ptrs & BIT(i))
-                       return true;
-               i++;
-       }
-
-       return false;
-}
-
-static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
-                                 struct bpos pos,
-                                 struct bch_io_opts opts,
-                                 unsigned flags,
-                                 struct bch_io_failures *failed)
-{
-       if (!have_io_error(failed)) {
-               BUG_ON(!opts.promote_target);
-
-               if (!(flags & BCH_READ_may_promote))
-                       return bch_err_throw(c, nopromote_may_not);
-
-               if (bch2_bkey_has_target(c, k, opts.promote_target))
-                       return bch_err_throw(c, nopromote_already_promoted);
-
-               if (bkey_extent_is_unwritten(k))
-                       return bch_err_throw(c, nopromote_unwritten);
-
-               if (bch2_target_congested(c, opts.promote_target))
-                       return bch_err_throw(c, nopromote_congested);
-       }
-
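-       /* only allow one promote in flight per extent position: */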
-       if (rhashtable_lookup_fast(&c->promote_table, &pos,
-                                  bch_promote_params))
-               return bch_err_throw(c, nopromote_in_flight);
-
-       return 0;
-}
-
-static noinline void promote_free(struct bch_read_bio *rbio)
-{
-       struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
-       struct bch_fs *c = rbio->c;
-
-       int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                        bch_promote_params);
-       BUG_ON(ret);
-
-       async_object_list_del(c, promote, op->list_idx);
-       async_object_list_del(c, rbio, rbio->list_idx);
-
-       bch2_data_update_exit(&op->write);
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
-       kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
-       struct promote_op *op = container_of(wop, struct promote_op, write.op);
-       struct bch_fs *c = op->write.rbio.c;
-
-       bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
-       promote_free(&op->write.rbio);
-}
-
-static void promote_start_work(struct work_struct *work)
-{
-       struct promote_op *op = container_of(work, struct promote_op, work);
-
-       bch2_data_update_read_done(&op->write);
-}
-
-static noinline void promote_start(struct bch_read_bio *rbio)
-{
-       struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
-
-       trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
-
-       INIT_WORK(&op->work, promote_start_work);
-       queue_work(rbio->c->write_ref_wq, &op->work);
-}
-
-static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
-                                           enum btree_id btree_id,
-                                           struct bkey_s_c k,
-                                           struct bpos pos,
-                                           struct extent_ptr_decoded *pick,
-                                           unsigned sectors,
-                                           struct bch_read_bio *orig,
-                                           struct bch_io_failures *failed)
-{
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
-
-       if (!have_io_error(failed)) {
-               update_opts.target = orig->opts.promote_target;
-               update_opts.extra_replicas = 1;
-               update_opts.write_flags |= BCH_WRITE_cached;
-               update_opts.write_flags |= BCH_WRITE_only_specified_devs;
-       } else {
-               update_opts.target = orig->opts.foreground_target;
-
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-               unsigned ptr_bit = 1;
-               bkey_for_each_ptr(ptrs, ptr) {
-                       if (bch2_dev_io_failures(failed, ptr->dev) &&
-                           !ptr_being_rewritten(orig, ptr->dev))
-                               update_opts.rewrite_ptrs |= ptr_bit;
-                       ptr_bit <<= 1;
-               }
-
-               if (!update_opts.rewrite_ptrs)
-                       return NULL;
-       }
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
-               return ERR_PTR(-BCH_ERR_nopromote_no_writes);
-
-       struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
-       if (!op) {
-               ret = bch_err_throw(c, nopromote_enomem);
-               goto err_put;
-       }
-
-       op->start_time = local_clock();
-       op->pos = pos;
-
-       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
-                                         bch_promote_params)) {
-               ret = bch_err_throw(c, nopromote_in_flight);
-               goto err;
-       }
-
-       ret = async_object_list_add(c, promote, op, &op->list_idx);
-       if (ret < 0)
-               goto err_remove_hash;
-
-       ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
-                       writepoint_hashed((unsigned long) current),
-                       &orig->opts,
-                       update_opts,
-                       btree_id, k);
-       op->write.type = BCH_DATA_UPDATE_promote;
-       /*
-        * possible errors: -BCH_ERR_nocow_lock_blocked,
-        * -BCH_ERR_ENOSPC_disk_reservation:
-        */
-       if (ret)
-               goto err_remove_list;
-
-       rbio_init_fragment(&op->write.rbio.bio, orig);
-       op->write.rbio.bounce   = true;
-       op->write.rbio.promote  = true;
-       op->write.op.end_io = promote_done;
-
-       return &op->write.rbio;
-err_remove_list:
-       async_object_list_del(c, promote, op->list_idx);
-err_remove_hash:
-       BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                     bch_promote_params));
-err:
-       bio_free_pages(&op->write.op.wbio.bio);
-       /* We may have added to the rhashtable and thus need rcu freeing: */
-       kfree_rcu(op, rcu);
-err_put:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
-       return ERR_PTR(ret);
-}
-
-noinline
-static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
-                                       struct bvec_iter iter,
-                                       struct bkey_s_c k,
-                                       struct extent_ptr_decoded *pick,
-                                       unsigned flags,
-                                       struct bch_read_bio *orig,
-                                       bool *bounce,
-                                       bool *read_full,
-                                       struct bch_io_failures *failed)
-{
-       /*
-        * We're in the retry path, but we don't know what to repair yet, and we
-        * don't want to do a promote here:
-        */
-       if (failed && !failed->nr)
-               return NULL;
-
-       struct bch_fs *c = trans->c;
-       /*
-        * if failed != NULL we're not actually doing a promote, we're
-        * recovering from an io/checksum error
-        */
-       bool promote_full = (have_io_error(failed) ||
-                            *read_full ||
-                            READ_ONCE(c->opts.promote_whole_extents));
-       /* data might have to be decompressed in the write path: */
-       unsigned sectors = promote_full
-               ? max(pick->crc.compressed_size, pick->crc.live_size)
-               : bvec_iter_sectors(iter);
-       struct bpos pos = promote_full
-               ? bkey_start_pos(k.k)
-               : POS(k.k->p.inode, iter.bi_sector);
-       int ret;
-
-       ret = should_promote(c, k, pos, orig->opts, flags, failed);
-       if (ret)
-               goto nopromote;
-
-       struct bch_read_bio *promote =
-               __promote_alloc(trans,
-                               k.k->type == KEY_TYPE_reflink_v
-                               ? BTREE_ID_reflink
-                               : BTREE_ID_extents,
-                               k, pos, pick, sectors, orig, failed);
-       if (!promote)
-               return NULL;
-
-       ret = PTR_ERR_OR_ZERO(promote);
-       if (ret)
-               goto nopromote;
-
-       *bounce         = true;
-       *read_full      = promote_full;
-
-       if (have_io_error(failed))
-               orig->self_healing = true;
-
-       return promote;
-nopromote:
-       trace_io_read_nopromote(c, ret);
-       return NULL;
-}
-
-void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
-{
-       if (!op->write.read_done) {
-               prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
-               printbuf_indent_add(out, 2);
-               bch2_read_bio_to_text(out, op->write.rbio.parent);
-               printbuf_indent_sub(out, 2);
-       }
-
-       bch2_data_update_to_text(out, &op->write);
-}
-
-/* Read */
-
-static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
-                                  struct bch_read_bio *rbio, struct bpos read_pos)
-{
-       int ret = lockrestart_do(trans,
-               bch2_inum_offset_err_msg_trans(trans, out,
-                               (subvol_inum) { rbio->subvol, read_pos.inode },
-                               read_pos.offset << 9));
-       if (ret)
-               return ret;
-
-       if (rbio->data_update)
-               prt_str(out, "(internal move) ");
-
-       return 0;
-}
-
-static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
-                             struct bch_read_bio *rbio, struct bpos read_pos)
-{
-       bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
-}
-
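-/*
- * Read completion work is ordered by the context it needs: NULL work can run
- * anywhere, HIGHPRI work is punted to a high priority workqueue, and UNBOUND
- * work (decompression, btree updates) needs full process context:
- */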
-enum rbio_context {
-       RBIO_CONTEXT_NULL,
-       RBIO_CONTEXT_HIGHPRI,
-       RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
-       return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
-                          enum rbio_context context,
-                          struct workqueue_struct *wq)
-{
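-       /*
-        * Run @fn directly if we're already in a capable enough context,
-        * otherwise punt to @wq, recording the new context so the same rbio
-        * is never punted twice:
-        */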
-       if (context <= rbio->context) {
-               fn(&rbio->work);
-       } else {
-               rbio->work.func         = fn;
-               rbio->context           = context;
-               queue_work(wq, &rbio->work);
-       }
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
-       BUG_ON(rbio->bounce && !rbio->split);
-
-       if (rbio->have_ioref) {
-               struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
-               enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
-       }
-
-       if (rbio->split) {
-               struct bch_read_bio *parent = rbio->parent;
-
-               if (unlikely(rbio->promote)) {
-                       if (!rbio->bio.bi_status)
-                               promote_start(rbio);
-                       else
-                               promote_free(rbio);
-               } else {
-                       async_object_list_del(rbio->c, rbio, rbio->list_idx);
-
-                       if (rbio->bounce)
-                               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
-                       bio_put(&rbio->bio);
-               }
-
-               rbio = parent;
-       }
-
-       return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-       if (rbio->start_time)
-               bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
-                                      rbio->start_time);
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       if (rbio->list_idx)
-               async_object_list_del(rbio->c, rbio, rbio->list_idx);
-#endif
-       bio_endio(&rbio->bio);
-}
-
-static void get_rbio_extent(struct btree_trans *trans,
-                           struct bch_read_bio *rbio,
-                           struct bkey_buf *sk)
-{
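-       /*
-        * For the retry path: look up the extent this rbio read from, keeping
-        * it only if it still points at the device we originally read from:
-        */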
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = lockrestart_do(trans,
-                       bkey_err(k = bch2_bkey_get_iter(trans, &iter,
-                                               rbio->data_btree, rbio->data_pos, 0)));
-       if (ret)
-               return;
-
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       bkey_for_each_ptr(ptrs, ptr)
-               if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
-                       bch2_bkey_buf_reassemble(sk, trans->c, k);
-                       break;
-               }
-
-       bch2_trans_iter_exit(trans, &iter);
-}
-
-static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
-                                       enum btree_id btree, struct bkey_s_c read_k)
-{
-       if (!bch2_poison_extents_on_checksum_error)
-               return 0;
-
-       struct bch_fs *c = trans->c;
-
-       struct data_update *u = rbio_data_update(rbio);
-       if (u)
-               read_k = bkey_i_to_s_c(u->k.k);
-
-       u64 flags = bch2_bkey_extent_flags(read_k);
-       if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-               return 0;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
-                                              BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (!bkey_and_val_eq(k, read_k))
-               goto out;
-
-       struct bkey_i *new = bch2_trans_kmalloc(trans,
-                                       bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
-       ret =   PTR_ERR_OR_ZERO(new) ?:
-               (bkey_reassemble(new, k), 0) ?:
-               bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
-               bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0);
-
-       /*
-        * Propagate key change back to data update path, in particular so it
-        * knows the extent has been poisoned and it's safe to change the
-        * checksum
-        */
-       if (u && !ret)
-               bch2_bkey_buf_copy(&u->k, c, new);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
-                                       struct bch_read_bio *rbio,
-                                       struct bvec_iter bvec_iter,
-                                       struct bch_io_failures *failed,
-                                       unsigned flags)
-{
-       struct data_update *u = container_of(rbio, struct data_update, rbio);
-retry:
-       bch2_trans_begin(trans);
-
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = lockrestart_do(trans,
-               bkey_err(k = bch2_bkey_get_iter(trans, &iter,
-                               u->btree_id, bkey_start_pos(&u->k.k->k),
-                               0)));
-       if (ret)
-               goto err;
-
-       if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
-               /* extent we wanted to read no longer exists: */
-               rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
-               goto err;
-       }
-
-       ret = __bch2_read_extent(trans, rbio, bvec_iter,
-                                bkey_start_pos(&u->k.k->k),
-                                u->btree_id,
-                                bkey_i_to_s_c(u->k.k),
-                                0, failed, flags, -1);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-           bch2_err_matches(ret, BCH_ERR_data_read_retry))
-               goto retry;
-
-       if (ret) {
-               rbio->bio.bi_status     = BLK_STS_IOERR;
-               rbio->ret               = ret;
-       }
-
-       BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
-       return ret;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bvec_iter iter   = rbio->bvec_iter;
-       unsigned flags          = rbio->flags;
-       subvol_inum inum = {
-               .subvol = rbio->subvol,
-               .inum   = rbio->read_pos.inode,
-       };
-       struct bch_io_failures failed = { .nr = 0 };
-
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       struct bkey_buf sk;
-       bch2_bkey_buf_init(&sk);
-       bkey_init(&sk.k->k);
-
-       trace_io_read_retry(&rbio->bio);
-       this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
-                    bvec_iter_sectors(rbio->bvec_iter));
-
-       get_rbio_extent(trans, rbio, &sk);
-
-       if (!bkey_deleted(&sk.k->k) &&
-           bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
-               bch2_mark_io_failure(&failed, &rbio->pick,
-                                    rbio->ret == -BCH_ERR_data_read_retry_csum_err);
-
-       if (!rbio->split) {
-               rbio->bio.bi_status     = 0;
-               rbio->ret               = 0;
-       }
-
-       unsigned subvol         = rbio->subvol;
-       struct bpos read_pos    = rbio->read_pos;
-
-       rbio = bch2_rbio_free(rbio);
-
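-       /*
-        * Retries are submitted synchronously, never promote, and must clone
-        * so errors in one fragment can be told apart from errors in the
-        * whole request:
-        */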
-       flags |= BCH_READ_in_retry;
-       flags &= ~BCH_READ_may_promote;
-       flags &= ~BCH_READ_last_fragment;
-       flags |= BCH_READ_must_clone;
-
-       int ret = rbio->data_update
-               ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
-               : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
-
-       if (ret) {
-               rbio->ret = ret;
-               rbio->bio.bi_status = BLK_STS_IOERR;
-       }
-
-       if (failed.nr || ret) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               lockrestart_do(trans,
-                       bch2_inum_offset_err_msg_trans(trans, &buf,
-                                       (subvol_inum) { subvol, read_pos.inode },
-                                       read_pos.offset << 9));
-               if (rbio->data_update)
-                       prt_str(&buf, "(internal move) ");
-
-               prt_str(&buf, "data read error, ");
-               if (!ret) {
-                       prt_str(&buf, "successful retry");
-                       if (rbio->self_healing)
-                               prt_str(&buf, ", self healing");
-               } else
-                       prt_str(&buf, bch2_err_str(ret));
-               prt_newline(&buf);
-
-               if (!bkey_deleted(&sk.k->k)) {
-                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
-                       prt_newline(&buf);
-               }
-
-               bch2_io_failures_to_text(&buf, c, &failed);
-
-               bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       bch2_rbio_done(rbio);
-       bch2_bkey_buf_exit(&sk, c);
-       bch2_trans_put(trans);
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio,
-                           int ret, blk_status_t blk_error)
-{
-       BUG_ON(ret >= 0);
-
-       rbio->ret               = ret;
-       rbio->bio.bi_status     = blk_error;
-
-       bch2_rbio_parent(rbio)->saw_error = true;
-
-       if (rbio->flags & BCH_READ_in_retry)
-               return;
-
-       if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
-               bch2_rbio_punt(rbio, bch2_rbio_retry,
-                              RBIO_CONTEXT_UNBOUND, system_dfl_wq);
-       } else {
-               rbio = bch2_rbio_free(rbio);
-
-               rbio->ret               = ret;
-               rbio->bio.bi_status     = blk_error;
-
-               bch2_rbio_done(rbio);
-       }
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
-                                  struct bch_read_bio *rbio)
-{
-       struct bch_fs *c = rbio->c;
-       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
-       struct bch_extent_crc_unpacked new_crc;
-       struct btree_iter iter;
-       struct bkey_i *new;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       if (crc_is_compressed(rbio->pick.crc))
-               return 0;
-
-       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
-                              BTREE_ITER_slots|BTREE_ITER_intent);
-       if ((ret = bkey_err(k)))
-               goto out;
-
-       if (bversion_cmp(k.k->bversion, rbio->version) ||
-           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
-               goto out;
-
-       /* Extent was merged? */
-       if (bkey_start_offset(k.k) < data_offset ||
-           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
-               goto out;
-
-       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
-                       rbio->pick.crc, NULL, &new_crc,
-                       bkey_start_offset(k.k) - data_offset, k.k->size,
-                       rbio->pick.crc.csum_type)) {
-               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
-               ret = 0;
-               goto out;
-       }
-
-       /*
-        * going to be temporarily appending another checksum entry:
-        */
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-                                sizeof(struct bch_extent_crc128));
-       if ((ret = PTR_ERR_OR_ZERO(new)))
-               goto out;
-
-       bkey_reassemble(new, k);
-
-       if (!bch2_bkey_narrow_crcs(new, new_crc))
-               goto out;
-
-       ret = bch2_trans_update(trans, &iter, new,
-                               BTREE_UPDATE_internal_snapshot_node);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
-       bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                            __bch2_rbio_narrow_crcs(trans, rbio));
-}
-
-static void bch2_read_decompress_err(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct printbuf buf = PRINTBUF;
-
-       bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
-       prt_str(&buf, "decompression error");
-
-       struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-       if (ca)
-               bch_err_ratelimited(ca, "%s", buf.buf);
-       else
-               bch_err_ratelimited(c, "%s", buf.buf);
-
-       bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
-       printbuf_exit(&buf);
-}
-
-static void bch2_read_decrypt_err(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct printbuf buf = PRINTBUF;
-
-       bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
-       prt_str(&buf, "decrypt error");
-
-       struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-       if (ca)
-               bch_err_ratelimited(ca, "%s", buf.buf);
-       else
-               bch_err_ratelimited(c, "%s", buf.buf);
-
-       bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
-       printbuf_exit(&buf);
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-       struct bch_read_bio *parent     = bch2_rbio_parent(rbio);
-       struct bio *src                 = &rbio->bio;
-       struct bio *dst                 = &parent->bio;
-       struct bvec_iter dst_iter       = rbio->bvec_iter;
-       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-       struct nonce nonce = extent_nonce(rbio->version, crc);
-       unsigned nofs_flags;
-       struct bch_csum csum;
-       int ret;
-
-       nofs_flags = memalloc_nofs_save();
-
-       /* Reset iterator for checksumming and copying bounced data: */
-       if (rbio->bounce) {
-               src->bi_iter.bi_size            = crc.compressed_size << 9;
-               src->bi_iter.bi_idx             = 0;
-               src->bi_iter.bi_bvec_done       = 0;
-       } else {
-               src->bi_iter                    = rbio->bvec_iter;
-       }
-
-       bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
-
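-       /* in no_data_io mode nothing was actually read, so skip verification: */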
-       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-       bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
-
-       /*
-        * Checksum error: if the bio wasn't bounced, we may have been
-        * reading into buffers owned by userspace (that userspace can
-        * scribble over) - retry the read, bouncing it this time:
-        */
-       if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
-               rbio->flags |= BCH_READ_must_bounce;
-               bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
-                               BLK_STS_IOERR);
-               goto out;
-       }
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
-       if (!csum_good)
-               goto csum_err;
-
-       /*
-        * XXX
-        * We need to rework the narrow_crcs path to deliver the read completion
-        * first, and then punt to a different workqueue, otherwise we're
-        * holding up reads while doing btree updates which is bad for memory
-        * reclaim.
-        */
-       if (unlikely(rbio->narrow_crcs))
-               bch2_rbio_narrow_crcs(rbio);
-
-       if (likely(!parent->data_update)) {
-               /* Adjust crc to point to subset of data we want: */
-               crc.offset     += rbio->offset_into_extent;
-               crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
-
-               if (crc_is_compressed(crc)) {
-                       ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-                       if (ret)
-                               goto decrypt_err;
-
-                       if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-                           !c->opts.no_data_io)
-                               goto decompression_err;
-               } else {
-                       /* don't need to decrypt the entire bio: */
-                       nonce = nonce_add(nonce, crc.offset << 9);
-                       bio_advance(src, crc.offset << 9);
-
-                       BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-                       src->bi_iter.bi_size = dst_iter.bi_size;
-
-                       ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-                       if (ret)
-                               goto decrypt_err;
-
-                       if (rbio->bounce) {
-                               struct bvec_iter src_iter = src->bi_iter;
-
-                               bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-                       }
-               }
-       } else {
-               if (rbio->split)
-                       rbio->parent->pick = rbio->pick;
-
-               if (rbio->bounce) {
-                       struct bvec_iter src_iter = src->bi_iter;
-
-                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-               }
-       }
-
-       if (rbio->promote) {
-               /*
-                * Re-encrypt data we decrypted, so it's consistent with
-                * rbio->crc:
-                */
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-       }
-
-       if (likely(!(rbio->flags & BCH_READ_in_retry))) {
-               rbio = bch2_rbio_free(rbio);
-               bch2_rbio_done(rbio);
-       }
-out:
-       memalloc_nofs_restore(nofs_flags);
-       return;
-csum_err:
-       bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
-       goto out;
-decompression_err:
-       bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
-       goto out;
-decrypt_err:
-       bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
-       goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
-       struct bch_read_bio *rbio =
-               container_of(bio, struct bch_read_bio, bio);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
-       struct workqueue_struct *wq = NULL;
-       enum rbio_context context = RBIO_CONTEXT_NULL;
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-                                  rbio->submit_time, !bio->bi_status);
-
-       if (!rbio->split)
-               rbio->bio.bi_end_io = rbio->end_io;
-
-       if (unlikely(bio->bi_status)) {
-               bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
-               return;
-       }
-
-       if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
-           (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
-               trace_and_count(c, io_read_reuse_race, &rbio->bio);
-
-               if (rbio->flags & BCH_READ_retry_if_stale)
-                       bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
-               else
-                       bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
-               return;
-       }
-
-       if (rbio->narrow_crcs ||
-           rbio->promote ||
-           crc_is_compressed(rbio->pick.crc) ||
-           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
-               context = RBIO_CONTEXT_UNBOUND, wq = system_dfl_wq;
-       else if (rbio->pick.crc.csum_type)
-               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
-       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
-                                                  struct bch_dev *ca,
-                                                  struct bkey_s_c k,
-                                                  struct bch_extent_ptr ptr)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            PTR_BUCKET_POS(ca, &ptr),
-                            BTREE_ITER_cached);
-
-       int gen = bucket_gen_get(ca, iter.pos.offset);
-       if (gen >= 0) {
-               prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
-               printbuf_indent_add(&buf, 2);
-
-               bch2_bkey_val_to_text(&buf, c, k);
-               prt_newline(&buf);
-
-               prt_printf(&buf, "memory gen: %u", gen);
-
-               ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
-               if (!ret) {
-                       prt_newline(&buf);
-                       bch2_bkey_val_to_text(&buf, c, k);
-               }
-       } else {
-               prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
-                          iter.pos.inode, iter.pos.offset);
-               printbuf_indent_add(&buf, 2);
-
-               prt_printf(&buf, "first bucket %u nbuckets %llu\n",
-                          ca->mi.first_bucket, ca->mi.nbuckets);
-
-               bch2_bkey_val_to_text(&buf, c, k);
-               prt_newline(&buf);
-       }
-
-       bch2_fs_inconsistent(c, "%s", buf.buf);
-
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
-                      struct bvec_iter iter, struct bpos read_pos,
-                      enum btree_id data_btree, struct bkey_s_c k,
-                      unsigned offset_into_extent,
-                      struct bch_io_failures *failed, unsigned flags, int dev)
-{
-       struct bch_fs *c = trans->c;
-       struct extent_ptr_decoded pick;
-       struct bch_read_bio *rbio = NULL;
-       bool bounce = false, read_full = false, narrow_crcs = false;
-       struct bpos data_pos = bkey_start_pos(k.k);
-       struct data_update *u = rbio_data_update(orig);
-       int ret = 0;
-
-       if (bkey_extent_is_inline_data(k.k)) {
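-               /*
-                * Inline extents store their data in the bkey value itself;
-                * copy it out and zero fill the remainder - no device IO
-                * needed:
-                */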
-               unsigned bytes = min_t(unsigned, iter.bi_size,
-                                      bkey_inline_data_bytes(k.k));
-
-               swap(iter.bi_size, bytes);
-               memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
-               swap(iter.bi_size, bytes);
-               bio_advance_iter(&orig->bio, &iter, bytes);
-               zero_fill_bio_iter(&orig->bio, iter);
-               this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
-                            bvec_iter_sectors(iter));
-               goto out_read_done;
-       }
-
-       if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
-           !orig->data_update)
-               return bch_err_throw(c, extent_poisoned);
-retry_pick:
-       ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
-
-       /* hole or reservation - just zero fill: */
-       if (!ret)
-               goto hole;
-
-       if (unlikely(ret < 0)) {
-               if (ret == -BCH_ERR_data_read_csum_err) {
-                       int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
-                       if (ret2) {
-                               ret = ret2;
-                               goto err;
-                       }
-
-                       trace_and_count(c, io_read_fail_and_poison, &orig->bio);
-               }
-
-               struct printbuf buf = PRINTBUF;
-               bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
-               prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
-               bch2_bkey_val_to_text(&buf, c, k);
-
-               bch_err_ratelimited(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               goto err;
-       }
-
-       if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
-           !c->chacha20_key_set) {
-               struct printbuf buf = PRINTBUF;
-               bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
-               prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
-               bch2_bkey_val_to_text(&buf, c, k);
-
-               bch_err_ratelimited(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               ret = bch_err_throw(c, data_read_no_encryption_key);
-               goto err;
-       }
-
-       struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
-                                       BCH_DEV_READ_REF_io_read);
-
-       /*
-        * Stale dirty pointers are treated as IO errors, but @failed isn't
-        * allocated unless we're in the retry path - so if we're not in the
-        * retry path, don't check here, it'll be caught in bch2_read_endio()
-        * and we'll end up in the retry path:
-        */
-       if ((flags & BCH_READ_in_retry) &&
-           !pick.ptr.cached &&
-           ca &&
-           unlikely(dev_ptr_stale(ca, &pick.ptr))) {
-               read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
-               bch2_mark_io_failure(failed, &pick, false);
-               enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
-               goto retry_pick;
-       }
-
-       if (likely(!u)) {
-               if (!(flags & BCH_READ_last_fragment) ||
-                   bio_flagged(&orig->bio, BIO_CHAIN))
-                       flags |= BCH_READ_must_clone;
-
-               narrow_crcs = !(flags & BCH_READ_in_retry) &&
-                       bch2_can_narrow_extent_crcs(k, pick.crc);
-
-               if (narrow_crcs && (flags & BCH_READ_user_mapped))
-                       flags |= BCH_READ_must_bounce;
-
-               EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
-               if (crc_is_compressed(pick.crc) ||
-                   (pick.crc.csum_type != BCH_CSUM_none &&
-                    (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-                     (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
-                      (flags & BCH_READ_user_mapped)) ||
-                     (flags & BCH_READ_must_bounce)))) {
-                       read_full = true;
-                       bounce = true;
-               }
-       } else {
-               /*
-                * can happen if we retry, and the extent we were going to read
-                * has been merged in the meantime:
-                */
-               if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
-                       if (ca)
-                               enumerated_ref_put(&ca->io_ref[READ],
-                                       BCH_DEV_READ_REF_io_read);
-                       orig->ret = bch_err_throw(c, data_read_buffer_too_small);
-                       goto out_read_done;
-               }
-
-               iter.bi_size    = pick.crc.compressed_size << 9;
-               read_full = true;
-       }
-
-       if (orig->opts.promote_target || have_io_error(failed))
-               rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
-                                    &bounce, &read_full, failed);
-
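-       /*
-        * If we're only reading part of the extent, narrow the pointer and
-        * crc to just the range being read so we do the minimum amount of IO:
-        */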
-       if (!read_full) {
-               EBUG_ON(crc_is_compressed(pick.crc));
-               EBUG_ON(pick.crc.csum_type &&
-                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-                        bvec_iter_sectors(iter) != pick.crc.live_size ||
-                        pick.crc.offset ||
-                        offset_into_extent));
-
-               data_pos.offset += offset_into_extent;
-               pick.ptr.offset += pick.crc.offset +
-                       offset_into_extent;
-               offset_into_extent              = 0;
-               pick.crc.compressed_size        = bvec_iter_sectors(iter);
-               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
-               pick.crc.offset                 = 0;
-               pick.crc.live_size              = bvec_iter_sectors(iter);
-       }
-
-       if (rbio) {
-               /*
-                * promote already allocated bounce rbio:
-                * promote needs to allocate a bio big enough for uncompressing
-                * data in the write path, but we're not going to use it all
-                * here:
-                */
-               EBUG_ON(rbio->bio.bi_iter.bi_size <
-                      pick.crc.compressed_size << 9);
-               rbio->bio.bi_iter.bi_size =
-                       pick.crc.compressed_size << 9;
-       } else if (bounce) {
-               unsigned sectors = pick.crc.compressed_size;
-
-               rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
-                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                                 0,
-                                                 GFP_NOFS,
-                                                 &c->bio_read_split),
-                                orig);
-
-               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
-               rbio->bounce    = true;
-       } else if (flags & BCH_READ_must_clone) {
-               /*
-                * Have to clone if there were any splits, due to error
-                * reporting issues: if a split errored and retrying didn't
-                * work, then when it reports the error to its parent (us) we
-                * can't tell whether the error came from our part of the bio -
-                * in which case we should retry - or from the whole bio, in
-                * which case retrying would lose the error.
-                */
-               rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
-                                                &c->bio_read_split),
-                                orig);
-               rbio->bio.bi_iter = iter;
-       } else {
-               rbio = orig;
-               rbio->bio.bi_iter = iter;
-               EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
-       }
-
-       EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
-       rbio->submit_time       = local_clock();
-       if (!rbio->split)
-               rbio->end_io    = orig->bio.bi_end_io;
-       rbio->bvec_iter         = iter;
-       rbio->offset_into_extent = offset_into_extent;
-       rbio->flags             = flags;
-       rbio->have_ioref        = ca != NULL;
-       rbio->narrow_crcs       = narrow_crcs;
-       rbio->ret               = 0;
-       rbio->context           = 0;
-       rbio->pick              = pick;
-       rbio->subvol            = orig->subvol;
-       rbio->read_pos          = read_pos;
-       rbio->data_btree        = data_btree;
-       rbio->data_pos          = data_pos;
-       rbio->version           = k.k->bversion;
-       INIT_WORK(&rbio->work, NULL);
-
-       rbio->bio.bi_opf        = orig->bio.bi_opf;
-       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
-       rbio->bio.bi_end_io     = bch2_read_endio;
-
-       async_object_list_add(c, rbio, rbio, &rbio->list_idx);
-
-       if (rbio->bounce)
-               trace_and_count(c, io_read_bounce, &rbio->bio);
-
-       if (!u)
-               this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
-       else
-               this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
-       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
-       /*
-        * If it's being moved internally, we don't want to flag it as a cache
-        * hit:
-        */
-       if (ca && pick.ptr.cached && !u)
-               bch2_bucket_io_time_reset(trans, pick.ptr.dev,
-                       PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
-       if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
-               bio_inc_remaining(&orig->bio);
-               trace_and_count(c, io_read_split, &orig->bio);
-       }
-
-       /*
-        * Unlock the iterator while the btree node's lock is still in
-        * cache, before doing the IO:
-        */
-       if (!(flags & BCH_READ_in_retry))
-               bch2_trans_unlock(trans);
-       else
-               bch2_trans_unlock_long(trans);
-
-       if (likely(!rbio->pick.do_ec_reconstruct)) {
-               if (unlikely(!rbio->have_ioref)) {
-                       bch2_rbio_error(rbio,
-                                       -BCH_ERR_data_read_retry_device_offline,
-                                       BLK_STS_IOERR);
-                       goto out;
-               }
-
-               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
-                            bio_sectors(&rbio->bio));
-               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
-               if (unlikely(c->opts.no_data_io)) {
-                       if (likely(!(flags & BCH_READ_in_retry)))
-                               bio_endio(&rbio->bio);
-               } else {
-                       if (likely(!(flags & BCH_READ_in_retry)))
-                               submit_bio(&rbio->bio);
-                       else
-                               submit_bio_wait(&rbio->bio);
-               }
-
-               /*
-                * We just submitted IO which may block, we expect relock fail
-                * events and shouldn't count them:
-                */
-               trans->notrace_relock_fail = true;
-       } else {
-               /* Attempting reconstruct read: */
-               if (bch2_ec_read_extent(trans, rbio, k)) {
-                       bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
-                                       BLK_STS_IOERR);
-                       goto out;
-               }
-
-               if (likely(!(flags & BCH_READ_in_retry)))
-                       bio_endio(&rbio->bio);
-       }
-out:
-       if (likely(!(flags & BCH_READ_in_retry))) {
-               return 0;
-       } else {
-               bch2_trans_unlock(trans);
-
-               int ret;
-
-               rbio->context = RBIO_CONTEXT_UNBOUND;
-               bch2_read_endio(&rbio->bio);
-
-               ret = rbio->ret;
-               rbio = bch2_rbio_free(rbio);
-
-               if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
-                       bch2_mark_io_failure(failed, &pick,
-                                       ret == -BCH_ERR_data_read_retry_csum_err);
-
-               return ret;
-       }
-
-err:
-       if (flags & BCH_READ_in_retry)
-               return ret;
-
-       orig->bio.bi_status     = BLK_STS_IOERR;
-       orig->ret               = ret;
-       goto out_read_done;
-
-hole:
-       this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
-                    bvec_iter_sectors(iter));
-       /*
-        * won't normally happen in the data update (bch2_move_extent()) path,
-        * but if we retry and the extent we wanted to read no longer exists we
-        * have to signal that:
-        */
-       if (u)
-               orig->ret = bch_err_throw(c, data_read_key_overwritten);
-
-       zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
-       if ((flags & BCH_READ_last_fragment) &&
-           !(flags & BCH_READ_in_retry))
-               bch2_rbio_done(orig);
-       return 0;
-}
-
-int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
-               struct bvec_iter bvec_iter, subvol_inum inum,
-               struct bch_io_failures *failed,
-               struct bkey_buf *prev_read,
-               unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       enum btree_id data_btree;
-       int ret;
-
-       EBUG_ON(rbio->data_update);
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, bvec_iter.bi_sector),
-                            BTREE_ITER_slots);
-
-       while (1) {
-               data_btree = BTREE_ID_extents;
-
-               bch2_trans_begin(trans);
-
-               u32 snapshot;
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       goto err;
-
-               bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
-               bch2_btree_iter_set_pos(trans, &iter,
-                               POS(inum.inum, bvec_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
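-               /*
-                * The extent we found may start before the position we're
-                * reading from; figure out where in the extent our read
-                * starts:
-                */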
-               s64 offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               unsigned sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       goto err;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               if (unlikely(flags & BCH_READ_in_retry)) {
-                       if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
-                               failed->nr = 0;
-                       bch2_bkey_buf_copy(prev_read, c, sk.k);
-               }
-
-               /*
-                * With indirect extents, the amount of data to read is the min
-                * of the original extent and the indirect extent:
-                */
-               sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
-               unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
-               swap(bvec_iter.bi_size, bytes);
-
-               if (bvec_iter.bi_size == bytes)
-                       flags |= BCH_READ_last_fragment;
-
-               ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
-                                        data_btree, k,
-                                        offset_into_extent, failed, flags, -1);
-               swap(bvec_iter.bi_size, bytes);
-
-               if (ret)
-                       goto err;
-
-               if (flags & BCH_READ_last_fragment)
-                       break;
-
-               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-err:
-               if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
-                       flags |= BCH_READ_must_bounce;
-
-               if (ret &&
-                   !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-                   !bch2_err_matches(ret, BCH_ERR_data_read_retry))
-                       break;
-       }
-
-       if (unlikely(ret)) {
-               if (ret != -BCH_ERR_extent_poisoned) {
-                       struct printbuf buf = PRINTBUF;
-                       lockrestart_do(trans,
-                                      bch2_inum_offset_err_msg_trans(trans, &buf, inum,
-                                                                     bvec_iter.bi_sector << 9));
-                       prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
-                       bch_err_ratelimited(c, "%s", buf.buf);
-                       printbuf_exit(&buf);
-               }
-
-               rbio->bio.bi_status     = BLK_STS_IOERR;
-               rbio->ret               = ret;
-
-               if (!(flags & BCH_READ_in_retry))
-                       bch2_rbio_done(rbio);
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_bkey_buf_exit(&sk, c);
-       return ret;
-}
-
-static const char * const bch2_read_bio_flags[] = {
-#define x(n)   #n,
-       BCH_READ_FLAGS()
-#undef x
-       NULL
-};
-
-void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
-{
-       u64 now = local_clock();
-       prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
-       prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
-
-       if (!rbio->split)
-               prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
-       else
-               prt_printf(out, "parent:\t%px\n", rbio->parent);
-
-       prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
-
-       prt_printf(out, "promote:\t%u\n",       rbio->promote);
-       prt_printf(out, "bounce:\t%u\n",        rbio->bounce);
-       prt_printf(out, "split:\t%u\n",         rbio->split);
-       prt_printf(out, "have_ioref:\t%u\n",    rbio->have_ioref);
-       prt_printf(out, "narrow_crcs:\t%u\n",   rbio->narrow_crcs);
-       prt_printf(out, "context:\t%u\n",       rbio->context);
-
-       int ret = READ_ONCE(rbio->ret);
-       if (ret < 0)
-               prt_printf(out, "ret:\t%s\n",           bch2_err_str(ret));
-       else
-               prt_printf(out, "ret:\t%i\n",           ret);
-
-       prt_printf(out, "flags:\t");
-       bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
-       prt_newline(out);
-
-       bch2_bio_to_text(out, &rbio->bio);
-}
-
-void bch2_fs_io_read_exit(struct bch_fs *c)
-{
-       if (c->promote_table.tbl)
-               rhashtable_destroy(&c->promote_table);
-       bioset_exit(&c->bio_read_split);
-       bioset_exit(&c->bio_read);
-       mempool_exit(&c->bio_bounce_pages);
-}
-
-int bch2_fs_io_read_init(struct bch_fs *c)
-{
-       if (mempool_init_page_pool(&c->bio_bounce_pages,
-                                  max_t(unsigned,
-                                        c->opts.btree_node_size,
-                                        c->opts.encoded_extent_max) /
-                                  PAGE_SIZE, 0))
-               return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
-
-       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return bch_err_throw(c, ENOMEM_bio_read_init);
-
-       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return bch_err_throw(c, ENOMEM_bio_read_split_init);
-
-       if (rhashtable_init(&c->promote_table, &bch_promote_params))
-               return bch_err_throw(c, ENOMEM_promote_table_init);
-
-       return 0;
-}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
deleted file mode 100644 (file)
index 9c5ddbf..0000000
+++ /dev/null
@@ -1,216 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_READ_H
-#define _BCACHEFS_IO_READ_H
-
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "extents_types.h"
-#include "reflink.h"
-
-struct bch_read_bio {
-       struct bch_fs           *c;
-       u64                     start_time;
-       u64                     submit_time;
-
-       /*
-        * Reads will often have to be split, and if the extent being read from
-        * was checksummed or compressed we'll also have to allocate bounce
-        * buffers and copy the data back into the original bio.
-        *
-        * If we didn't have to split, we have to save and restore the original
-        * bi_end_io - @split below indicates which:
-        */
-       union {
-       struct bch_read_bio     *parent;
-       bio_end_io_t            *end_io;
-       };
-
-       /*
-        * Saved copy of bio->bi_iter, from submission time - allows us to
-        * resubmit on IO error, and also to copy data back to the original bio
-        * when we're bouncing:
-        */
-       struct bvec_iter        bvec_iter;
-
-       unsigned                offset_into_extent;
-
-       u16                     flags;
-       union {
-       struct {
-       u16                     data_update:1,
-                               promote:1,
-                               bounce:1,
-                               split:1,
-                               have_ioref:1,
-                               narrow_crcs:1,
-                               saw_error:1,
-                               self_healing:1,
-                               context:2;
-       };
-       u16                     _state;
-       };
-       s16                     ret;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       unsigned                list_idx;
-#endif
-
-       struct extent_ptr_decoded pick;
-
-       /*
-        * pos we read from - different from data_pos for indirect extents:
-        */
-       u32                     subvol;
-       struct bpos             read_pos;
-
-       /*
-        * start pos of data we read (may not be pos of data we want) - for
-        * promote, narrow extents paths:
-        */
-       enum btree_id           data_btree;
-       struct bpos             data_pos;
-       struct bversion         version;
-
-       struct bch_io_opts      opts;
-
-       struct work_struct      work;
-
-       struct bio              bio;
-};
-
-#define to_rbio(_bio)          container_of((_bio), struct bch_read_bio, bio)
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
-                                           enum btree_id *data_btree,
-                                           s64 *offset_into_extent,
-                                           struct bkey_buf *extent)
-{
-       if (extent->k->k.type != KEY_TYPE_reflink_p)
-               return 0;
-
-       *data_btree = BTREE_ID_reflink;
-
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
-                                               offset_into_extent,
-                                               bkey_i_to_s_c_reflink_p(extent->k),
-                                               true, 0);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (bkey_deleted(k.k)) {
-               bch2_trans_iter_exit(trans, &iter);
-               return bch_err_throw(c, missing_indirect_extent);
-       }
-
-       bch2_bkey_buf_reassemble(extent, c, k);
-       bch2_trans_iter_exit(trans, &iter);
-       return 0;
-}
-
-#define BCH_READ_FLAGS()               \
-       x(retry_if_stale)               \
-       x(may_promote)                  \
-       x(user_mapped)                  \
-       x(last_fragment)                \
-       x(must_bounce)                  \
-       x(must_clone)                   \
-       x(in_retry)
-
-enum __bch_read_flags {
-#define x(n)   __BCH_READ_##n,
-       BCH_READ_FLAGS()
-#undef x
-};
-
-enum bch_read_flags {
-#define x(n)   BCH_READ_##n = BIT(__BCH_READ_##n),
-       BCH_READ_FLAGS()
-#undef x
-};
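
/*
 * For reference, a sketch of what the x-macro above expands to (only the
 * first entries spelled out):
 *
 *        enum __bch_read_flags {
 *                __BCH_READ_retry_if_stale,
 *                __BCH_READ_may_promote,
 *                ...
 *        };
 *
 *        enum bch_read_flags {
 *                BCH_READ_retry_if_stale = BIT(__BCH_READ_retry_if_stale),
 *                BCH_READ_may_promote    = BIT(__BCH_READ_may_promote),
 *                ...
 *        };
 *
 * The same list generates the bch2_read_bio_flags[] name table in io_read.c,
 * so flag bits and their debug names stay in sync from a single definition.
 */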
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
-                      struct bvec_iter, struct bpos, enum btree_id,
-                      struct bkey_s_c, unsigned,
-                      struct bch_io_failures *, unsigned, int);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
-                       struct bch_read_bio *rbio, struct bpos read_pos,
-                       enum btree_id data_btree, struct bkey_s_c k,
-                       unsigned offset_into_extent, unsigned flags)
-{
-       int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
-                                    data_btree, k, offset_into_extent, NULL, flags, -1);
-       /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
-       WARN(ret, "unhandled error from __bch2_read_extent()");
-}
-
-int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
-               subvol_inum,
-               struct bch_io_failures *, struct bkey_buf *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                            subvol_inum inum)
-{
-       BUG_ON(rbio->_state);
-
-       rbio->subvol = inum.subvol;
-
-       bch2_trans_run(c,
-               __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
-                           BCH_READ_retry_if_stale|
-                           BCH_READ_may_promote|
-                           BCH_READ_user_mapped));
-}
-
-static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
-                                                     struct bch_read_bio *orig)
-{
-       struct bch_read_bio *rbio = to_rbio(bio);
-
-       rbio->c                 = orig->c;
-       rbio->_state            = 0;
-       rbio->flags             = 0;
-       rbio->ret               = 0;
-       rbio->split             = true;
-       rbio->parent            = orig;
-       rbio->opts              = orig->opts;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       rbio->list_idx          = 0;
-#endif
-       return rbio;
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
-                                            struct bch_fs *c,
-                                            struct bch_io_opts opts,
-                                            bio_end_io_t end_io)
-{
-       struct bch_read_bio *rbio = to_rbio(bio);
-
-       rbio->start_time        = local_clock();
-       rbio->c                 = c;
-       rbio->_state            = 0;
-       rbio->flags             = 0;
-       rbio->ret               = 0;
-       rbio->opts              = opts;
-       rbio->bio.bi_end_io     = end_io;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       rbio->list_idx          = 0;
-#endif
-       return rbio;
-}
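
/*
 * A hypothetical usage sketch (not from the original file; filling in
 * bi_iter and adding pages to the bio are omitted): allocate a bio from
 * c->bio_read, whose front_pad embeds it in a struct bch_read_bio, then
 * initialize and submit it:
 *
 *        struct bio *bio = bio_alloc_bioset(NULL, nr_vecs, REQ_OP_READ,
 *                                           GFP_NOFS, &c->bio_read);
 *        struct bch_read_bio *rbio = rbio_init(bio, c, opts, my_end_io);
 *
 *        bch2_read(c, rbio, inum);
 */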
-
-struct promote_op;
-void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
-void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
-
-void bch2_fs_io_read_exit(struct bch_fs *);
-int bch2_fs_io_read_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
deleted file mode 100644 (file)
index 88b1eec..0000000
+++ /dev/null
@@ -1,1780 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_write_corrupt_ratio;
-module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(write_corrupt_ratio, "Randomly corrupt roughly one in N writes (debug)");
-#endif
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
-                                      u64 now, int rw)
-{
-       u64 latency_capable =
-               ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
-       /* ideally we'd be taking into account the device's variance here: */
-       u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
-       s64 latency_over = io_latency - latency_threshold;
-
-       if (latency_threshold && latency_over > 0) {
-               /*
-                * bump up congested by approximately latency_over * 4 /
-                * latency_threshold - we don't need much accuracy here so don't
-                * bother with the divide:
-                */
-               if (atomic_read(&ca->congested) < CONGESTED_MAX)
-                       atomic_add(latency_over >>
-                                  max_t(int, ilog2(latency_threshold) - 2, 0),
-                                  &ca->congested);
-
-               ca->congested_last = now;
-       } else if (atomic_read(&ca->congested) > 0) {
-               atomic_dec(&ca->congested);
-       }
-}
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
-       atomic64_t *latency = &ca->cur_latency[rw];
-       u64 now = local_clock();
-       u64 io_latency = time_after64(now, submit_time)
-               ? now - submit_time
-               : 0;
-       u64 old, new;
-
-       old = atomic64_read(latency);
-       do {
-               /*
-                * If the io latency was reasonably close to the current
-                * latency, skip doing the update and atomic operation - most of
-                * the time:
-                */
-               if (abs((int) (old - io_latency)) < (old >> 1) &&
-                   now & ~(~0U << 5))
-                       break;
-
-               new = ewma_add(old, io_latency, 5);
-       } while (!atomic64_try_cmpxchg(latency, &old, new));
-
-       bch2_congested_acct(ca, io_latency, now, rw);
-
-       __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
-}
-
-#endif
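
/*
 * A standalone sketch of the shift-based EWMA used above (a weight of 5
 * keeps 31/32 of the old value per sample; the real ewma_add() helper may
 * differ in rounding details):
 */
static inline u64 ewma_add_sketch(u64 old, u64 val, unsigned weight)
{
        /* new = (old * (2^weight - 1) + val) / 2^weight */
        return ((old << weight) - old + val) >> weight;
}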
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
-       struct bvec_iter_all iter;
-       struct bio_vec *bv;
-
-       bio_for_each_segment_all(bv, bio, iter)
-               if (bv->bv_page != ZERO_PAGE(0))
-                       mempool_free(bv->bv_page, &c->bio_bounce_pages);
-       bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
-       struct page *page;
-
-       if (likely(!*using_mempool)) {
-               page = alloc_page(GFP_NOFS);
-               if (unlikely(!page)) {
-                       mutex_lock(&c->bio_bounce_pages_lock);
-                       *using_mempool = true;
-                       goto pool_alloc;
-               }
-       } else {
-pool_alloc:
-               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
-       }
-
-       return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
-                              size_t size)
-{
-       bool using_mempool = false;
-
-       while (size) {
-               struct page *page = __bio_alloc_page_pool(c, &using_mempool);
-               unsigned len = min_t(size_t, PAGE_SIZE, size);
-
-               BUG_ON(!bio_add_page(bio, page, len, 0));
-               size -= len;
-       }
-
-       if (using_mempool)
-               mutex_unlock(&c->bio_bounce_pages_lock);
-}
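
/*
 * The allocation pattern above, distilled into a sketch (hypothetical
 * names; per-filesystem locking as in the original): prefer the regular
 * allocator, and fall back to the mempool's guaranteed reserve under a
 * mutex so only one thread drains the reserve at a time:
 */
static struct page *alloc_page_with_fallback(mempool_t *pool,
                                             struct mutex *lock,
                                             bool *using_pool)
{
        if (likely(!*using_pool)) {
                struct page *page = alloc_page(GFP_NOFS);
                if (page)
                        return page;
                mutex_lock(lock);       /* caller unlocks once the batch is done */
                *using_pool = true;
        }
        return mempool_alloc(pool, GFP_NOFS);
}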
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
-                              struct btree_iter *extent_iter,
-                              struct bkey_i *new,
-                              bool *usage_increasing,
-                              s64 *i_sectors_delta,
-                              s64 *disk_sectors_delta)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c old;
-       unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
-       bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
-       int ret = 0;
-
-       *usage_increasing       = false;
-       *i_sectors_delta        = 0;
-       *disk_sectors_delta     = 0;
-
-       bch2_trans_copy_iter(trans, &iter, extent_iter);
-
-       for_each_btree_key_max_continue_norestart(trans, iter,
-                               new->k.p, BTREE_ITER_slots, old, ret) {
-               s64 sectors = min(new->k.p.offset, old.k->p.offset) -
-                       max(bkey_start_offset(&new->k),
-                           bkey_start_offset(old.k));
-
-               *i_sectors_delta += sectors *
-                       (bkey_extent_is_allocation(&new->k) -
-                        bkey_extent_is_allocation(old.k));
-
-               *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
-               *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
-                       ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
-                       : 0;
-
-               if (!*usage_increasing &&
-                   (new->k.p.snapshot != old.k->p.snapshot ||
-                    new_replicas > bch2_bkey_replicas(c, old) ||
-                    (!new_compressed && bch2_bkey_sectors_compressed(old))))
-                       *usage_increasing = true;
-
-               if (bkey_ge(old.k->p, new->k.p))
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
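
/*
 * Worked example of the overlap arithmetic above (illustrative numbers):
 * if the new extent covers sectors [16..24) and an old key covers [20..32),
 * then
 *
 *        sectors = min(24, 32) - max(16, 20) = 24 - 20 = 4
 *
 * i.e. four sectors overlap; i_sectors_delta and disk_sectors_delta are
 * then scaled by the allocation and replica deltas between the two keys.
 */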
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
-                                                   struct btree_iter *extent_iter,
-                                                   u64 new_i_size,
-                                                   s64 i_sectors_delta)
-{
-       /*
-        * Crazy performance optimization:
-        * Every extent update needs to also update the inode: the inode trigger
-        * will set bi->journal_seq to the journal sequence number of this
-        * transaction - for fsync.
-        *
-        * But if that's the only reason we're updating the inode (we're not
-        * updating bi_size or bi_sectors), then we don't need the inode update
-        * to be journalled - if we crash, the bi_journal_seq update will be
-        * lost, but that's fine.
-        */
-       unsigned inode_update_flags = BTREE_UPDATE_nojournal;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                             SPOS(0,
-                                  extent_iter->pos.inode,
-                                  extent_iter->snapshot),
-                             BTREE_ITER_intent|
-                             BTREE_ITER_cached);
-       int ret = bkey_err(k);
-       if (unlikely(ret))
-               return ret;
-
-       /*
-        * varint_decode_fast(), in the inode .invalid method, reads up to 7
-        * bytes past the end of the buffer:
-        */
-       struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
-       ret = PTR_ERR_OR_ZERO(k_mut);
-       if (unlikely(ret))
-               goto err;
-
-       bkey_reassemble(k_mut, k);
-
-       if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
-               k_mut = bch2_inode_to_v3(trans, k_mut);
-               ret = PTR_ERR_OR_ZERO(k_mut);
-               if (unlikely(ret))
-                       goto err;
-       }
-
-       struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
-
-       if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
-           new_i_size > le64_to_cpu(inode->v.bi_size)) {
-               inode->v.bi_size = cpu_to_le64(new_i_size);
-               inode_update_flags = 0;
-       }
-
-       if (i_sectors_delta) {
-               s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
-               if (unlikely(bi_sectors + i_sectors_delta < 0)) {
-                       struct bch_fs *c = trans->c;
-                       struct printbuf buf = PRINTBUF;
-                       bch2_log_msg_start(c, &buf);
-                       prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
-                                  extent_iter->pos.inode, bi_sectors, i_sectors_delta);
-
-                       bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
-                       if (print)
-                               bch2_print_str(c, KERN_ERR, buf.buf);
-                       printbuf_exit(&buf);
-
-                       if (i_sectors_delta < 0)
-                               i_sectors_delta = -bi_sectors;
-                       else
-                               i_sectors_delta = 0;
-               }
-
-               le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
-               inode_update_flags = 0;
-       }
-
-       /*
-        * extents, dirents and xattrs updates require that an inode update also
-        * happens - to ensure that if a key exists in one of those btrees with
-        * a given snapshot ID an inode is also present - so we may have to skip
-        * the nojournal optimization:
-        */
-       if (inode->k.p.snapshot != iter.snapshot) {
-               inode->k.p.snapshot = iter.snapshot;
-               inode_update_flags = 0;
-       }
-
-       ret = bch2_trans_update(trans, &iter, &inode->k_i,
-                               BTREE_UPDATE_internal_snapshot_node|
-                               inode_update_flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
-                      subvol_inum inum,
-                      struct btree_iter *iter,
-                      struct bkey_i *k,
-                      struct disk_reservation *disk_res,
-                      u64 new_i_size,
-                      s64 *i_sectors_delta_total,
-                      bool check_enospc)
-{
-       struct bpos next_pos;
-       bool usage_increasing;
-       s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-       int ret;
-
-       /*
-        * This traverses the iterator without changing iter->path->pos to
-        * search_key() (which is pos + 1 for extents): we want there to be a
-        * path already traversed at iter->pos, because
-        * bch2_trans_extent_update() will use it to attempt extent merging:
-        */
-       ret = __bch2_btree_iter_traverse(trans, iter);
-       if (ret)
-               return ret;
-
-       ret = bch2_extent_trim_atomic(trans, iter, k);
-       if (ret)
-               return ret;
-
-       next_pos = k->k.p;
-
-       ret = bch2_sum_sector_overwrites(trans, iter, k,
-                       &usage_increasing,
-                       &i_sectors_delta,
-                       &disk_sectors_delta);
-       if (ret)
-               return ret;
-
-       if (disk_res &&
-           disk_sectors_delta > (s64) disk_res->sectors) {
-               ret = bch2_disk_reservation_add(trans->c, disk_res,
-                                       disk_sectors_delta - disk_res->sectors,
-                                       !check_enospc || !usage_increasing
-                                       ? BCH_DISK_RESERVATION_NOFAIL : 0);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * Note:
-        * We always have to do an inode update - even when i_size/i_sectors
-        * aren't changing - for fsync to work properly; fsync relies on
-        * inode->bi_journal_seq which is updated by the trigger code:
-        */
-       ret =   bch2_extent_update_i_size_sectors(trans, iter,
-                                                 min(k->k.p.offset << 9, new_i_size),
-                                                 i_sectors_delta) ?:
-               bch2_trans_update(trans, iter, k, 0) ?:
-               bch2_trans_commit(trans, disk_res, NULL,
-                               BCH_TRANS_COMMIT_no_check_rw|
-                               BCH_TRANS_COMMIT_no_enospc);
-       if (unlikely(ret))
-               return ret;
-
-       if (i_sectors_delta_total)
-               *i_sectors_delta_total += i_sectors_delta;
-       bch2_btree_iter_set_pos(trans, iter, next_pos);
-       return 0;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct bkey_buf sk;
-       struct keylist *keys = &op->insert_keys;
-       struct bkey_i *k = bch2_keylist_front(keys);
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       subvol_inum inum = {
-               .subvol = op->subvol,
-               .inum   = k->k.p.inode,
-       };
-       int ret;
-
-       BUG_ON(!inum.subvol);
-
-       bch2_bkey_buf_init(&sk);
-
-       do {
-               bch2_trans_begin(trans);
-
-               k = bch2_keylist_front(keys);
-               bch2_bkey_buf_copy(&sk, c, k);
-
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
-                                                 &sk.k->k.p.snapshot);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                                    bkey_start_pos(&sk.k->k),
-                                    BTREE_ITER_slots|BTREE_ITER_intent);
-
-               ret =   bch2_extent_update(trans, inum, &iter, sk.k,
-                                       &op->res,
-                                       op->new_i_size, &op->i_sectors_delta,
-                                       op->flags & BCH_WRITE_check_enospc);
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               if (bkey_ge(iter.pos, k->k.p))
-                       bch2_keylist_pop_front(&op->insert_keys);
-               else
-                       bch2_cut_front(iter.pos, k);
-       } while (!bch2_keylist_empty(keys));
-
-       bch2_trans_put(trans);
-       bch2_bkey_buf_exit(&sk, c);
-
-       return ret;
-}
-
-/* Writes */
-
-void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
-{
-       struct printbuf buf = PRINTBUF;
-
-       if (op->subvol) {
-               bch2_inum_offset_err_msg(op->c, &buf,
-                                        (subvol_inum) { op->subvol, op->pos.inode, },
-                                        offset << 9);
-       } else {
-               struct bpos pos = op->pos;
-               pos.offset = offset;
-               bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
-       }
-
-       prt_str(&buf, "write error: ");
-
-       va_list args;
-       va_start(args, fmt);
-       prt_vprintf(&buf, fmt, args);
-       va_end(args);
-
-       if (op->flags & BCH_WRITE_move) {
-               struct data_update *u = container_of(op, struct data_update, op);
-
-               prt_printf(&buf, "\n  from internal move ");
-               bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
-       }
-
-       bch_err_ratelimited(op->c, "%s", buf.buf);
-       printbuf_exit(&buf);
-}
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
-                              enum bch_data_type type,
-                              const struct bkey_i *k,
-                              bool nocow)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-       struct bch_write_bio *n;
-       unsigned ref_rw  = type == BCH_DATA_btree ? READ : WRITE;
-       unsigned ref_idx = type == BCH_DATA_btree
-               ? BCH_DEV_READ_REF_btree_node_write
-               : BCH_DEV_WRITE_REF_io_write;
-
-       BUG_ON(c->opts.nochanges);
-
-       const struct bch_extent_ptr *last = NULL;
-       bkey_for_each_ptr(ptrs, ptr)
-               last = ptr;
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               /*
-                * XXX: btree writes should be using io_ref[WRITE], but we
-                * aren't retrying failed btree writes yet (due to device
-                * removal/ro):
-                */
-               struct bch_dev *ca = nocow
-                       ? bch2_dev_have_ref(c, ptr->dev)
-                       : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
-
-               if (ptr != last) {
-                       n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
-
-                       n->bio.bi_end_io        = wbio->bio.bi_end_io;
-                       n->bio.bi_private       = wbio->bio.bi_private;
-                       n->parent               = wbio;
-                       n->split                = true;
-                       n->bounce               = false;
-                       n->put_bio              = true;
-                       n->bio.bi_opf           = wbio->bio.bi_opf;
-                       bio_inc_remaining(&wbio->bio);
-               } else {
-                       n = wbio;
-                       n->split                = false;
-               }
-
-               n->c                    = c;
-               n->dev                  = ptr->dev;
-               n->have_ioref           = ca != NULL;
-               n->nocow                = nocow;
-               n->submit_time          = local_clock();
-               n->inode_offset         = bkey_start_offset(&k->k);
-               if (nocow)
-                       n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
-               n->bio.bi_iter.bi_sector = ptr->offset;
-
-               if (likely(n->have_ioref)) {
-                       this_cpu_add(ca->io_done->sectors[WRITE][type],
-                                    bio_sectors(&n->bio));
-
-                       bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
-                       if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
-                               bio_endio(&n->bio);
-                               continue;
-                       }
-
-                       submit_bio(&n->bio);
-               } else {
-                       n->bio.bi_status        = BLK_STS_REMOVED;
-                       bio_endio(&n->bio);
-               }
-       }
-}
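
/*
 * The fan-out pattern above, distilled into a sketch (hypothetical names;
 * error handling and per-device accounting omitted): clone one child bio
 * per replica, reuse the original bio for the last pointer, and take one
 * extra remaining-count reference per clone so the parent's completion
 * only runs once every child has completed:
 */
static void replica_child_endio(struct bio *bio)
{
        struct bio *parent = bio->bi_private;

        bio_put(bio);
        bio_endio(parent);      /* drops one bio_inc_remaining() reference */
}

static void submit_to_replicas(struct bio *orig, struct block_device **bdevs,
                               unsigned nr)
{
        for (unsigned i = 0; i + 1 < nr; i++) {
                struct bio *child = bio_alloc_clone(bdevs[i], orig, GFP_NOFS,
                                                    &fs_bio_set);

                child->bi_end_io        = replica_child_endio;
                child->bi_private       = orig;
                bio_inc_remaining(orig);
                submit_bio(child);
        }

        bio_set_dev(orig, bdevs[nr - 1]);
        submit_bio(orig);       /* the last pointer reuses the original bio */
}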
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_fs *c = op->c;
-
-       EBUG_ON(op->open_buckets.nr);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
-       bch2_disk_reservation_put(c, &op->res);
-
-       if (!(op->flags & BCH_WRITE_move))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
-       bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
-       EBUG_ON(cl->parent);
-       closure_debug_destroy(cl);
-       async_object_list_del(c, write_op, op->list_idx);
-       if (op->end_io)
-               op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct keylist *keys = &op->insert_keys;
-       struct bkey_i *src, *dst = keys->keys, *n;
-
-       for (src = keys->keys; src != keys->top; src = n) {
-               n = bkey_next(src);
-
-               if (bkey_extent_is_direct_data(&src->k)) {
-                       bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
-                                           test_bit(ptr->dev, op->failed.d));
-
-                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
-                               return bch_err_throw(c, data_write_io);
-               }
-
-               if (dst != src)
-                       memmove_u64s_down(dst, src, src->k.u64s);
-               dst = bkey_next(dst);
-       }
-
-       keys->top = dst;
-       return 0;
-}
-
-/**
- * __bch2_write_index - after a write, update index to point to new data
- * @op:                bch_write_op to process
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct keylist *keys = &op->insert_keys;
-       unsigned dev;
-       int ret = 0;
-
-       if (unlikely(op->flags & BCH_WRITE_io_error)) {
-               ret = bch2_write_drop_io_error_ptrs(op);
-               if (ret)
-                       goto err;
-       }
-
-       if (!bch2_keylist_empty(keys)) {
-               u64 sectors_start = keylist_sectors(keys);
-
-               ret = !(op->flags & BCH_WRITE_move)
-                       ? bch2_write_index_default(op)
-                       : bch2_data_update_index_update(op);
-
-               BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-               BUG_ON(keylist_sectors(keys) && !ret);
-
-               op->written += sectors_start - keylist_sectors(keys);
-
-               if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
-                       struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
-                       bch2_write_op_error(op, bkey_start_offset(&insert->k),
-                                           "btree update error: %s", bch2_err_str(ret));
-               }
-
-               if (ret)
-                       goto err;
-       }
-out:
-       /* If a bucket wasn't written, we can't erasure code it: */
-       for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
-               bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
-
-       bch2_open_buckets_put(c, &op->open_buckets);
-       return;
-err:
-       keys->top = keys->keys;
-       op->error = ret;
-       op->flags |= BCH_WRITE_submitted;
-       goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
-       if (state != wp->state) {
-               struct task_struct *p = current;
-               u64 now = ktime_get_ns();
-               u64 runtime = p->se.sum_exec_runtime +
-                       (now - p->se.exec_start);
-
-               if (state == WRITE_POINT_runnable)
-                       wp->last_runtime = runtime;
-               else if (wp->state == WRITE_POINT_runnable)
-                       wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
-
-               if (wp->last_state_change &&
-                   time_after64(now, wp->last_state_change))
-                       wp->time[wp->state] += now - wp->last_state_change;
-               wp->state = state;
-               wp->last_state_change = now;
-       }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
-       enum write_point_state state;
-
-       state = running                  ? WRITE_POINT_runnable:
-               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
-                                        : WRITE_POINT_stopped;
-
-       __wp_update_state(wp, state);
-}
-
-static CLOSURE_CALLBACK(bch2_write_index)
-{
-       closure_type(op, struct bch_write_op, cl);
-       struct write_point *wp = op->wp;
-       struct workqueue_struct *wq = index_update_wq(op);
-       unsigned long flags;
-
-       if ((op->flags & BCH_WRITE_submitted) &&
-           (op->flags & BCH_WRITE_move))
-               bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
-       spin_lock_irqsave(&wp->writes_lock, flags);
-       if (wp->state == WRITE_POINT_waiting_io)
-               __wp_update_state(wp, WRITE_POINT_waiting_work);
-       list_add_tail(&op->wp_list, &wp->writes);
-       spin_unlock_irqrestore(&wp->writes_lock, flags);
-
-       queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
-       op->wp = wp;
-
-       if (wp->state == WRITE_POINT_stopped) {
-               spin_lock_irq(&wp->writes_lock);
-               __wp_update_state(wp, WRITE_POINT_waiting_io);
-               spin_unlock_irq(&wp->writes_lock);
-       }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
-       struct write_point *wp =
-               container_of(work, struct write_point, index_update_work);
-       struct bch_write_op *op;
-
-       while (1) {
-               spin_lock_irq(&wp->writes_lock);
-               op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
-               wp_update_state(wp, op != NULL);
-               spin_unlock_irq(&wp->writes_lock);
-
-               if (!op)
-                       break;
-
-               op->flags |= BCH_WRITE_in_worker;
-
-               __bch2_write_index(op);
-
-               if (!(op->flags & BCH_WRITE_submitted))
-                       __bch2_write(op);
-               else
-                       bch2_write_done(&op->cl);
-       }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
-       struct closure *cl              = bio->bi_private;
-       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
-       struct bch_write_bio *wbio      = to_wbio(bio);
-       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
-       struct bch_fs *c                = wbio->c;
-       struct bch_dev *ca              = wbio->have_ioref
-               ? bch2_dev_have_ref(c, wbio->dev)
-               : NULL;
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
-                                  wbio->submit_time, !bio->bi_status);
-
-       if (unlikely(bio->bi_status)) {
-               if (ca)
-                       bch_err_inum_offset_ratelimited(ca,
-                                           op->pos.inode,
-                                           wbio->inode_offset << 9,
-                                           "data write error: %s",
-                                           bch2_blk_status_to_str(bio->bi_status));
-               else
-                       bch_err_inum_offset_ratelimited(c,
-                                           op->pos.inode,
-                                           wbio->inode_offset << 9,
-                                           "data write error: %s",
-                                           bch2_blk_status_to_str(bio->bi_status));
-               set_bit(wbio->dev, op->failed.d);
-               op->flags |= BCH_WRITE_io_error;
-       }
-
-       if (wbio->nocow) {
-               bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                        POS(ca->dev_idx, wbio->nocow_bucket),
-                                        BUCKET_NOCOW_LOCK_UPDATE);
-               set_bit(wbio->dev, op->devs_need_flush->d);
-       }
-
-       if (wbio->have_ioref)
-               enumerated_ref_put(&ca->io_ref[WRITE],
-                                  BCH_DEV_WRITE_REF_io_write);
-
-       if (wbio->bounce)
-               bch2_bio_free_pages_pool(c, bio);
-
-       if (wbio->put_bio)
-               bio_put(bio);
-
-       if (parent)
-               bio_endio(&parent->bio);
-       else
-               closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
-                              struct write_point *wp,
-                              struct bversion version,
-                              struct bch_extent_crc_unpacked crc)
-{
-       struct bkey_i_extent *e;
-
-       op->pos.offset += crc.uncompressed_size;
-
-       e = bkey_extent_init(op->insert_keys.top);
-       e->k.p          = op->pos;
-       e->k.size       = crc.uncompressed_size;
-       e->k.bversion   = version;
-
-       if (crc.csum_type ||
-           crc.compression_type ||
-           crc.nonce)
-               bch2_extent_crc_append(&e->k_i, crc);
-
-       bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
-                                      op->flags & BCH_WRITE_cached);
-
-       if (!(op->flags & BCH_WRITE_move))
-               bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
-
-       bch2_keylist_push(&op->insert_keys);
-}
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
-                                       struct write_point *wp,
-                                       struct bio *src,
-                                       bool *page_alloc_failed,
-                                       void *buf)
-{
-       struct bch_write_bio *wbio;
-       struct bio *bio;
-       unsigned output_available =
-               min(wp->sectors_free << 9, src->bi_iter.bi_size);
-       unsigned pages = DIV_ROUND_UP(output_available +
-                                     (buf
-                                      ? ((unsigned long) buf & (PAGE_SIZE - 1))
-                                      : 0), PAGE_SIZE);
-
-       pages = min(pages, BIO_MAX_VECS);
-
-       bio = bio_alloc_bioset(NULL, pages, 0,
-                              GFP_NOFS, &c->bio_write);
-       wbio                    = wbio_init(bio);
-       wbio->put_bio           = true;
-       /* copy WRITE_SYNC flag */
-       wbio->bio.bi_opf        = src->bi_opf;
-
-       if (buf) {
-               bch2_bio_map(bio, buf, output_available);
-               return bio;
-       }
-
-       wbio->bounce            = true;
-
-       /*
-        * We can't use mempool for more than c->sb.encoded_extent_max
-        * worth of pages, but we'd like to allocate more if we can:
-        */
-       bch2_bio_alloc_pages_pool(c, bio,
-                                 min_t(unsigned, output_available,
-                                       c->opts.encoded_extent_max));
-
-       if (bio->bi_iter.bi_size < output_available)
-               *page_alloc_failed =
-                       bch2_bio_alloc_pages(bio,
-                                            output_available -
-                                            bio->bi_iter.bi_size,
-                                            GFP_NOFS) != 0;
-
-       return bio;
-}
-
-static int bch2_write_rechecksum(struct bch_fs *c,
-                                struct bch_write_op *op,
-                                unsigned new_csum_type)
-{
-       struct bio *bio = &op->wbio.bio;
-       struct bch_extent_crc_unpacked new_crc;
-
-       /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
-       if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
-           bch2_csum_type_is_encryption(new_csum_type))
-               new_csum_type = op->crc.csum_type;
-
-       int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
-                                     NULL, &new_crc,
-                                     op->crc.offset, op->crc.live_size,
-                                     new_csum_type);
-       if (ret)
-               return ret;
-
-       bio_advance(bio, op->crc.offset << 9);
-       bio->bi_iter.bi_size = op->crc.live_size << 9;
-       op->crc = new_crc;
-       return 0;
-}
-
-static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
-       struct bch_fs *c = op->c;
-       struct bio *bio = &op->wbio.bio;
-       struct bch_csum csum;
-       int ret = 0;
-
-       BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
-       /* Can we just write the entire extent as is? */
-       if (op->crc.uncompressed_size == op->crc.live_size &&
-           op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
-           op->crc.compressed_size <= wp->sectors_free &&
-           (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
-            op->incompressible)) {
-               if (!crc_is_compressed(op->crc) &&
-                   op->csum_type != op->crc.csum_type) {
-                       ret = bch2_write_rechecksum(c, op, op->csum_type);
-                       if (ret)
-                               return ret;
-               }
-
-               return 1;
-       }
-
-       /*
-        * If the data is compressed and we couldn't write the entire extent as
-        * is, we have to decompress it:
-        */
-       if (crc_is_compressed(op->crc)) {
-               /* Last point we can still verify checksum: */
-               struct nonce nonce = extent_nonce(op->version, op->crc);
-               csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
-               if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
-                       goto csum_err;
-
-               if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
-                       ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
-                       if (ret)
-                               return ret;
-
-                       op->crc.csum_type = 0;
-                       op->crc.csum = (struct bch_csum) { 0, 0 };
-               }
-
-               ret = bch2_bio_uncompress_inplace(op, bio);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * No longer have compressed data after this point - data might be
-        * encrypted:
-        */
-
-       /*
-        * If the data is checksummed and we're only writing a subset,
-        * rechecksum and adjust bio to point to currently live data:
-        */
-       if (op->crc.live_size != op->crc.uncompressed_size ||
-           op->crc.csum_type != op->csum_type) {
-               ret = bch2_write_rechecksum(c, op, op->csum_type);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * If we want to compress the data, it has to be decrypted:
-        */
-       if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
-           (op->compression_opt || op->crc.csum_type != op->csum_type)) {
-               struct nonce nonce = extent_nonce(op->version, op->crc);
-               csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
-               if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
-                       goto csum_err;
-
-               ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
-               if (ret)
-                       return ret;
-
-               op->crc.csum_type = 0;
-               op->crc.csum = (struct bch_csum) { 0, 0 };
-       }
-
-       return 0;
-csum_err:
-       bch2_write_op_error(op, op->pos.offset,
-               "error verifying existing checksum while moving existing data (memory corruption?)\n"
-               "  expected %0llx:%0llx got %0llx:%0llx type %s",
-               op->crc.csum.hi,
-               op->crc.csum.lo,
-               csum.hi,
-               csum.lo,
-               op->crc.csum_type < BCH_CSUM_NR
-               ? __bch2_csum_types[op->crc.csum_type]
-               : "(unknown)");
-       return bch_err_throw(c, data_write_csum);
-}
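
/*
 * Return convention above (derived from the caller in bch2_write_extent()):
 * 1 means the encoded extent can be written out as-is, 0 means it still
 * needs the normal checksum/compress path, and negative values are errors.
 */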
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
-                            struct bio **_dst)
-{
-       struct bch_fs *c = op->c;
-       struct bio *src = &op->wbio.bio, *dst = src;
-       struct bvec_iter saved_iter;
-       void *ec_buf;
-       unsigned total_output = 0, total_input = 0;
-       bool bounce = false;
-       bool page_alloc_failed = false;
-       int ret, more = 0;
-
-       if (op->incompressible)
-               op->compression_opt = 0;
-
-       BUG_ON(!bio_sectors(src));
-
-       ec_buf = bch2_writepoint_ec_buf(c, wp);
-
-       if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
-               ret = bch2_write_prep_encoded_data(op, wp);
-               if (ret < 0)
-                       goto err;
-               if (ret) {
-                       if (ec_buf) {
-                               dst = bch2_write_bio_alloc(c, wp, src,
-                                                          &page_alloc_failed,
-                                                          ec_buf);
-                               bio_copy_data(dst, src);
-                               bounce = true;
-                       }
-                       init_append_extent(op, wp, op->version, op->crc);
-                       goto do_write;
-               }
-       }
-
-       if (ec_buf ||
-           op->compression_opt ||
-           (op->csum_type &&
-            !(op->flags & BCH_WRITE_pages_stable)) ||
-           (bch2_csum_type_is_encryption(op->csum_type) &&
-            !(op->flags & BCH_WRITE_pages_owned))) {
-               dst = bch2_write_bio_alloc(c, wp, src,
-                                          &page_alloc_failed,
-                                          ec_buf);
-               bounce = true;
-       }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
-       if (!bounce && write_corrupt_ratio) {
-               dst = bch2_write_bio_alloc(c, wp, src,
-                                          &page_alloc_failed,
-                                          ec_buf);
-               bounce = true;
-       }
-#endif
-       saved_iter = dst->bi_iter;
-
-       do {
-               struct bch_extent_crc_unpacked crc = { 0 };
-               struct bversion version = op->version;
-               size_t dst_len = 0, src_len = 0;
-
-               if (page_alloc_failed &&
-                   dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
-                   dst->bi_iter.bi_size < c->opts.encoded_extent_max)
-                       break;
-
-               BUG_ON(op->compression_opt &&
-                      (op->flags & BCH_WRITE_data_encoded) &&
-                      bch2_csum_type_is_encryption(op->crc.csum_type));
-               BUG_ON(op->compression_opt && !bounce);
-
-               crc.compression_type = op->incompressible
-                       ? BCH_COMPRESSION_TYPE_incompressible
-                       : op->compression_opt
-                       ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
-                                           op->compression_opt)
-                       : 0;
-               if (!crc_is_compressed(crc)) {
-                       dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-                       dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
-                       if (op->csum_type)
-                               dst_len = min_t(unsigned, dst_len,
-                                               c->opts.encoded_extent_max);
-
-                       if (bounce) {
-                               swap(dst->bi_iter.bi_size, dst_len);
-                               bio_copy_data(dst, src);
-                               swap(dst->bi_iter.bi_size, dst_len);
-                       }
-
-                       src_len = dst_len;
-               }
-
-               BUG_ON(!src_len || !dst_len);
-
-               if (bch2_csum_type_is_encryption(op->csum_type)) {
-                       if (bversion_zero(version)) {
-                               version.lo = atomic64_inc_return(&c->key_version);
-                       } else {
-                               crc.nonce = op->nonce;
-                               op->nonce += src_len >> 9;
-                       }
-               }
-
-               if ((op->flags & BCH_WRITE_data_encoded) &&
-                   !crc_is_compressed(crc) &&
-                   bch2_csum_type_is_encryption(op->crc.csum_type) ==
-                   bch2_csum_type_is_encryption(op->csum_type)) {
-                       u8 compression_type = crc.compression_type;
-                       u16 nonce = crc.nonce;
-                       /*
-                        * Note: when we're using rechecksum(), we need to be
-                        * checksumming @src because it has all the data our
-                        * existing checksum covers - if we bounced (because we
-                        * were trying to compress), @dst will only have the
-                        * part of the data the new checksum will cover.
-                        *
-                        * But normally we want to be checksumming post bounce,
-                        * because part of the reason for bouncing is so the
-                        * data can't be modified (by userspace) while it's in
-                        * flight.
-                        */
-                       ret = bch2_rechecksum_bio(c, src, version, op->crc,
-                                       &crc, &op->crc,
-                                       src_len >> 9,
-                                       bio_sectors(src) - (src_len >> 9),
-                                       op->csum_type);
-                       if (ret)
-                               goto err;
-                       /*
-                        * bch2_rechecksum_bio() sets compression_type on crc
-                        * from op->crc; this isn't always correct, as sometimes
-                        * we're changing an extent from uncompressed to
-                        * incompressible.
-                        */
-                       crc.compression_type = compression_type;
-                       crc.nonce = nonce;
-               } else {
-                       if ((op->flags & BCH_WRITE_data_encoded) &&
-                           (ret = bch2_rechecksum_bio(c, src, version, op->crc,
-                                       NULL, &op->crc,
-                                       src_len >> 9,
-                                       bio_sectors(src) - (src_len >> 9),
-                                       op->crc.csum_type)))
-                               goto err;
-
-                       crc.compressed_size     = dst_len >> 9;
-                       crc.uncompressed_size   = src_len >> 9;
-                       crc.live_size           = src_len >> 9;
-
-                       swap(dst->bi_iter.bi_size, dst_len);
-                       ret = bch2_encrypt_bio(c, op->csum_type,
-                                              extent_nonce(version, crc), dst);
-                       if (ret)
-                               goto err;
-
-                       crc.csum = bch2_checksum_bio(c, op->csum_type,
-                                        extent_nonce(version, crc), dst);
-                       crc.csum_type = op->csum_type;
-                       swap(dst->bi_iter.bi_size, dst_len);
-               }
-
-               init_append_extent(op, wp, version, crc);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-               if (write_corrupt_ratio) {
-                       swap(dst->bi_iter.bi_size, dst_len);
-                       bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
-                       swap(dst->bi_iter.bi_size, dst_len);
-               }
-#endif
-
-               if (dst != src)
-                       bio_advance(dst, dst_len);
-               bio_advance(src, src_len);
-               total_output    += dst_len;
-               total_input     += src_len;
-       } while (dst->bi_iter.bi_size &&
-                src->bi_iter.bi_size &&
-                wp->sectors_free &&
-                !bch2_keylist_realloc(&op->insert_keys,
-                                     op->inline_keys,
-                                     ARRAY_SIZE(op->inline_keys),
-                                     BKEY_EXTENT_U64s_MAX));
-
-       more = src->bi_iter.bi_size != 0;
-
-       dst->bi_iter = saved_iter;
-
-       if (dst == src && more) {
-               BUG_ON(total_output != total_input);
-
-               dst = bio_split(src, total_input >> 9,
-                               GFP_NOFS, &c->bio_write);
-               wbio_init(dst)->put_bio = true;
-               /* copy WRITE_SYNC flag */
-               dst->bi_opf             = src->bi_opf;
-       }
-
-       dst->bi_iter.bi_size = total_output;
-do_write:
-       *_dst = dst;
-       return more;
-err:
-       if (to_wbio(dst)->bounce)
-               bch2_bio_free_pages_pool(c, dst);
-       if (to_wbio(dst)->put_bio)
-               bio_put(dst);
-
-       return ret;
-}
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
-                                    struct bkey_s_c k)
-{
-       struct bch_fs *c = op->c;
-       struct bkey_s_c_extent e;
-       struct extent_ptr_decoded p;
-       const union bch_extent_entry *entry;
-       unsigned replicas = 0;
-
-       if (k.k->type != KEY_TYPE_extent)
-               return false;
-
-       e = bkey_s_c_to_extent(k);
-
-       guard(rcu)();
-       extent_for_each_ptr_decode(e, p, entry) {
-               if (crc_is_encoded(p.crc) || p.has_ec)
-                       return false;
-
-               replicas += bch2_extent_ptr_durability(c, &p);
-       }
-
-       return replicas >= op->opts.data_replicas;
-}
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
-                                                 struct btree_iter *iter,
-                                                 struct bkey_i *orig,
-                                                 struct bkey_s_c k,
-                                                 u64 new_i_size)
-{
-       if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
-               /* trace this */
-               return 0;
-       }
-
-       struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-       int ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               return ret;
-
-       bch2_cut_front(bkey_start_pos(&orig->k), new);
-       bch2_cut_back(orig->k.p, new);
-
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-       bkey_for_each_ptr(ptrs, ptr)
-               ptr->unwritten = 0;
-
-       /*
-        * Note that we're not calling bch2_subvol_get_snapshot() in this path -
-        * that was done when we kicked off the write, and here it's important
-        * that we update the extent that we wrote to - even if a snapshot has
-        * since been created. The write is still outstanding, so we're ok
-        * w.r.t. snapshot atomicity:
-        */
-       return  bch2_extent_update_i_size_sectors(trans, iter,
-                                       min(new->k.p.offset << 9, new_i_size), 0) ?:
-               bch2_trans_update(trans, iter, new,
-                                 BTREE_UPDATE_internal_snapshot_node);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = 0;
-
-       for_each_keylist_key(&op->insert_keys, orig) {
-               ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
-                                    bkey_start_pos(&orig->k), orig->k.p,
-                                    BTREE_ITER_intent, k,
-                                    NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-                       bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
-               }));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_put(trans);
-
-       if (ret && !bch2_err_matches(ret, EROFS)) {
-               struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-               bch2_write_op_error(op, bkey_start_offset(&insert->k),
-                                   "btree update error: %s", bch2_err_str(ret));
-       }
-
-       if (ret)
-               op->error = ret;
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
-       if (unlikely(op->flags & BCH_WRITE_io_error)) {
-               op->error = bch_err_throw(op->c, data_write_io);
-       } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
-               bch2_nocow_write_convert_unwritten(op);
-}
-
-static CLOSURE_CALLBACK(bch2_nocow_write_done)
-{
-       closure_type(op, struct bch_write_op, cl);
-
-       __bch2_nocow_write_done(op);
-       bch2_write_done(cl);
-}
-
-struct bucket_to_lock {
-       struct bpos             b;
-       unsigned                gen;
-       struct nocow_lock_bucket *l;
-};
-
-static void bch2_nocow_write(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_trans *trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
-       u32 snapshot;
-       struct bucket_to_lock *stale_at;
-       int stale, ret;
-
-       if (op->flags & BCH_WRITE_move)
-               return;
-
-       darray_init(&buckets);
-       trans = bch2_trans_get(c);
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
-       if (unlikely(ret))
-               goto err;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(op->pos.inode, op->pos.offset, snapshot),
-                            BTREE_ITER_slots);
-       while (1) {
-               struct bio *bio = &op->wbio.bio;
-
-               buckets.nr = 0;
-
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       break;
-
-               k = bch2_btree_iter_peek_slot(trans, &iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               /* fall back to normal cow write path? */
-               if (unlikely(k.k->p.snapshot != snapshot ||
-                            !bch2_extent_is_writeable(op, k)))
-                       break;
-
-               if (bch2_keylist_realloc(&op->insert_keys,
-                                        op->inline_keys,
-                                        ARRAY_SIZE(op->inline_keys),
-                                        k.k->u64s))
-                       break;
-
-               /* Get iorefs before dropping btree locks: */
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-               bkey_for_each_ptr(ptrs, ptr) {
-                       struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
-                                                       BCH_DEV_WRITE_REF_io_write);
-                       if (unlikely(!ca))
-                               goto err_get_ioref;
-
-                       struct bpos b = PTR_BUCKET_POS(ca, ptr);
-                       struct nocow_lock_bucket *l =
-                               bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
-                       prefetch(l);
-
-                       /* XXX allocating memory with btree locks held - rare */
-                       darray_push_gfp(&buckets, ((struct bucket_to_lock) {
-                                                  .b = b, .gen = ptr->gen, .l = l,
-                                                  }), GFP_KERNEL|__GFP_NOFAIL);
-
-                       if (ptr->unwritten)
-                               op->flags |= BCH_WRITE_convert_unwritten;
-               }
-
-               /* Unlock before taking nocow locks, doing IO: */
-               bkey_reassemble(op->insert_keys.top, k);
-               bch2_trans_unlock(trans);
-
-               bch2_cut_front(op->pos, op->insert_keys.top);
-               if (op->flags & BCH_WRITE_convert_unwritten)
-                       bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
-               darray_for_each(buckets, i) {
-                       struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
-
-                       __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
-                                                bucket_to_u64(i->b),
-                                                BUCKET_NOCOW_LOCK_UPDATE);
-
-                       int gen = bucket_gen_get(ca, i->b.offset);
-                       stale = gen < 0 ? gen : gen_after(gen, i->gen);
-                       if (unlikely(stale)) {
-                               stale_at = i;
-                               goto err_bucket_stale;
-                       }
-               }
-
-               bio = &op->wbio.bio;
-               if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
-                       bio = bio_split(bio, k.k->p.offset - op->pos.offset,
-                                       GFP_KERNEL, &c->bio_write);
-                       wbio_init(bio)->put_bio = true;
-                       bio->bi_opf = op->wbio.bio.bi_opf;
-               } else {
-                       op->flags |= BCH_WRITE_submitted;
-               }
-
-               op->pos.offset += bio_sectors(bio);
-               op->written += bio_sectors(bio);
-
-               bio->bi_end_io  = bch2_write_endio;
-               bio->bi_private = &op->cl;
-               bio->bi_opf |= REQ_OP_WRITE;
-               closure_get(&op->cl);
-
-               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
-                                         op->insert_keys.top, true);
-
-               bch2_keylist_push(&op->insert_keys);
-               if (op->flags & BCH_WRITE_submitted)
-                       break;
-               bch2_btree_iter_advance(trans, &iter);
-       }
-out:
-       bch2_trans_iter_exit(trans, &iter);
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       bch2_trans_put(trans);
-       darray_exit(&buckets);
-
-       if (ret) {
-               bch2_write_op_error(op, op->pos.offset,
-                                   "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
-               op->error = ret;
-               op->flags |= BCH_WRITE_submitted;
-       }
-
-       /* fallback to cow write path? */
-       if (!(op->flags & BCH_WRITE_submitted)) {
-               closure_sync(&op->cl);
-               __bch2_nocow_write_done(op);
-               op->insert_keys.top = op->insert_keys.keys;
-       } else if (op->flags & BCH_WRITE_sync) {
-               closure_sync(&op->cl);
-               bch2_nocow_write_done(&op->cl.work);
-       } else {
-               /*
-                * XXX
-                * needs to run out of process context because ei_quota_lock is
-                * a mutex
-                */
-               continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
-       }
-       return;
-err_get_ioref:
-       darray_for_each(buckets, i)
-               enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
-                                  BCH_DEV_WRITE_REF_io_write);
-
-       /* Fall back to COW path: */
-       goto out;
-err_bucket_stale:
-       darray_for_each(buckets, i) {
-               bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
-               if (i == stale_at)
-                       break;
-       }
-
-       struct printbuf buf = PRINTBUF;
-       if (bch2_fs_inconsistent_on(stale < 0, c,
-                                   "pointer to invalid bucket in nocow path on device %llu\n  %s",
-                                   stale_at->b.inode,
-                                   (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               ret = bch_err_throw(c, data_write_invalid_ptr);
-       } else {
-               /* We can retry this: */
-               ret = bch_err_throw(c, transaction_restart);
-       }
-       printbuf_exit(&buf);
-
-       goto err_get_ioref;
-}
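
The stale check above compares bucket generation numbers with gen_after(); generations are small counters that wrap, so ordering has to be decided modulo their width. A minimal standalone sketch of wraparound-safe comparison in the same spirit (illustrative only, not bcachefs's exact helper):

#include <stdint.h>
#include <stdio.h>

/* True if generation a is newer than b, modulo 256: the signed difference
 * handles wraparound where a plain '>' would give the wrong answer. */
static int gen_newer(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

int main(void)
{
	printf("%d\n", gen_newer(1, 255));	/* 1: wrapped around, still newer */
	printf("%d\n", gen_newer(255, 1));	/* 0 */
	return 0;
}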
-
-static void __bch2_write(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct write_point *wp = NULL;
-       struct bio *bio = NULL;
-       unsigned nofs_flags;
-       int ret;
-
-       nofs_flags = memalloc_nofs_save();
-
-       if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
-               bch2_nocow_write(op);
-               if (op->flags & BCH_WRITE_submitted)
-                       goto out_nofs_restore;
-       }
-again:
-       memset(&op->failed, 0, sizeof(op->failed));
-
-       do {
-               struct bkey_i *key_to_write;
-               unsigned key_to_write_offset = op->insert_keys.top_p -
-                       op->insert_keys.keys_p;
-
-               /* +1 for possible cache device: */
-               if (op->open_buckets.nr + op->nr_replicas + 1 >
-                   ARRAY_SIZE(op->open_buckets.v))
-                       break;
-
-               if (bch2_keylist_realloc(&op->insert_keys,
-                                       op->inline_keys,
-                                       ARRAY_SIZE(op->inline_keys),
-                                       BKEY_EXTENT_U64s_MAX))
-                       break;
-
-               /*
-                * The copygc thread is now global: it no longer frees up space
-                * on specific disks, so allocations for specific disks may hang
-                * arbitrarily long:
-                */
-               ret = bch2_trans_run(c, lockrestart_do(trans,
-                       bch2_alloc_sectors_start_trans(trans,
-                               op->target,
-                               op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
-                               op->write_point,
-                               &op->devs_have,
-                               op->nr_replicas,
-                               op->nr_replicas_required,
-                               op->watermark,
-                               op->flags,
-                               &op->cl, &wp)));
-               if (unlikely(ret)) {
-                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                               break;
-
-                       goto err;
-               }
-
-               EBUG_ON(!wp);
-
-               bch2_open_bucket_get(c, wp, &op->open_buckets);
-               ret = bch2_write_extent(op, wp, &bio);
-
-               bch2_alloc_sectors_done_inlined(c, wp);
-err:
-               if (ret <= 0) {
-                       op->flags |= BCH_WRITE_submitted;
-
-                       if (unlikely(ret < 0)) {
-                               if (!(op->flags & BCH_WRITE_alloc_nowait))
-                                       bch2_write_op_error(op, op->pos.offset,
-                                                           "%s(): %s", __func__, bch2_err_str(ret));
-                               op->error = ret;
-                               break;
-                       }
-               }
-
-               bio->bi_end_io  = bch2_write_endio;
-               bio->bi_private = &op->cl;
-               bio->bi_opf |= REQ_OP_WRITE;
-
-               closure_get(bio->bi_private);
-
-               key_to_write = (void *) (op->insert_keys.keys_p +
-                                        key_to_write_offset);
-
-               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
-                                         key_to_write, false);
-       } while (ret);
-
-       /*
-        * Sync or no?
-        *
-        * If we're running asynchronously, we may still want to block
-        * synchronously here if we weren't able to submit all of the IO at
-        * once, as that signals backpressure to the caller.
-        */
-       if ((op->flags & BCH_WRITE_sync) ||
-           (!(op->flags & BCH_WRITE_submitted) &&
-            !(op->flags & BCH_WRITE_in_worker))) {
-               bch2_wait_on_allocator(c, &op->cl);
-
-               __bch2_write_index(op);
-
-               if (!(op->flags & BCH_WRITE_submitted))
-                       goto again;
-               bch2_write_done(&op->cl);
-       } else {
-               bch2_write_queue(op, wp);
-               continue_at(&op->cl, bch2_write_index, NULL);
-       }
-out_nofs_restore:
-       memalloc_nofs_restore(nofs_flags);
-}
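
Note how the loop above records key_to_write as an offset (top_p - keys_p) before bch2_keylist_realloc() may move the buffer, and rebuilds the pointer only at submission time. A standalone sketch of that offsets-across-realloc technique (generic C, names hypothetical):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t nr = 4, used = 2;
	int *keys = malloc(nr * sizeof(*keys));
	if (!keys)
		return 1;

	size_t top_off = used;			/* an offset survives a realloc */

	int *bigger = realloc(keys, 2 * nr * sizeof(*keys));
	if (!bigger) {
		free(keys);
		return 1;
	}
	keys = bigger;				/* the buffer may have moved */

	int *top = keys + top_off;		/* pointer rebuilt after the move */
	*top = 42;
	printf("%d\n", keys[used]);		/* prints 42 */
	free(keys);
	return 0;
}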
-
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
-       struct bio *bio = &op->wbio.bio;
-       struct bvec_iter iter;
-       struct bkey_i_inline_data *id;
-       unsigned sectors;
-       int ret;
-
-       memset(&op->failed, 0, sizeof(op->failed));
-
-       op->flags |= BCH_WRITE_wrote_data_inline;
-       op->flags |= BCH_WRITE_submitted;
-
-       bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
-       ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
-                                  ARRAY_SIZE(op->inline_keys),
-                                  BKEY_U64s + DIV_ROUND_UP(data_len, 8));
-       if (ret) {
-               op->error = ret;
-               goto err;
-       }
-
-       sectors = bio_sectors(bio);
-       op->pos.offset += sectors;
-
-       id = bkey_inline_data_init(op->insert_keys.top);
-       id->k.p         = op->pos;
-       id->k.bversion  = op->version;
-       id->k.size      = sectors;
-
-       iter = bio->bi_iter;
-       iter.bi_size = data_len;
-       memcpy_from_bio(id->v.data, bio, iter);
-
-       while (data_len & 7)
-               id->v.data[data_len++] = '\0';
-       set_bkey_val_bytes(&id->k, data_len);
-       bch2_keylist_push(&op->insert_keys);
-
-       __bch2_write_index(op);
-err:
-       bch2_write_done(&op->cl);
-}
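
The `while (data_len & 7)` loop above zero-pads the inline payload to a multiple of 8 bytes, since bkey values are sized in u64s. A small standalone illustration of the same alignment arithmetic (round_up_8() and pad_to_u64s() are hypothetical helpers, equivalent in effect to the loop):

#include <assert.h>
#include <string.h>

/* Round len up to the next multiple of 8, as the padding loop does. */
static unsigned round_up_8(unsigned len)
{
	return (len + 7) & ~7U;
}

/* Zero the pad bytes and return the padded length, in one shot. */
static unsigned pad_to_u64s(char *buf, unsigned data_len)
{
	unsigned padded = round_up_8(data_len);

	memset(buf + data_len, 0, padded - data_len);
	return padded;
}

int main(void)
{
	char buf[16] = "hello";
	assert(pad_to_u64s(buf, 5) == 8);
	assert(buf[5] == 0 && buf[6] == 0 && buf[7] == 0);
	return 0;
}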
-
-/**
- * bch2_write() - handle a write to a cache device or flash only volume
- * @cl:                &bch_write_op->cl
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-CLOSURE_CALLBACK(bch2_write)
-{
-       closure_type(op, struct bch_write_op, cl);
-       struct bio *bio = &op->wbio.bio;
-       struct bch_fs *c = op->c;
-       unsigned data_len;
-
-       EBUG_ON(op->cl.parent);
-       BUG_ON(!op->nr_replicas);
-       BUG_ON(!op->write_point.v);
-       BUG_ON(bkey_eq(op->pos, POS_MAX));
-
-       async_object_list_add(c, write_op, op, &op->list_idx);
-
-       if (op->flags & BCH_WRITE_only_specified_devs)
-               op->flags |= BCH_WRITE_alloc_nowait;
-
-       op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
-       op->start_time = local_clock();
-       bch2_keylist_init(&op->insert_keys, op->inline_keys);
-       wbio_init(bio)->put_bio = false;
-
-       if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
-               bch2_write_op_error(op, op->pos.offset, "misaligned write");
-               op->error = bch_err_throw(c, data_write_misaligned);
-               goto err;
-       }
-
-       if (c->opts.nochanges) {
-               op->error = bch_err_throw(c, erofs_no_writes);
-               goto err;
-       }
-
-       if (!(op->flags & BCH_WRITE_move) &&
-           !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
-               op->error = bch_err_throw(c, erofs_no_writes);
-               goto err;
-       }
-
-       if (!(op->flags & BCH_WRITE_move))
-               this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
-       bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
-       data_len = min_t(u64, bio->bi_iter.bi_size,
-                        op->new_i_size - (op->pos.offset << 9));
-
-       if (c->opts.inline_data &&
-           data_len <= min(block_bytes(c) / 2, 1024U)) {
-               bch2_write_data_inline(op, data_len);
-               return;
-       }
-
-       __bch2_write(op);
-       return;
-err:
-       bch2_disk_reservation_put(c, &op->res);
-
-       closure_debug_destroy(&op->cl);
-       async_object_list_del(c, write_op, op->list_idx);
-       if (op->end_io)
-               op->end_io(op);
-}
-
-static const char * const bch2_write_flags[] = {
-#define x(f)   #f,
-       BCH_WRITE_FLAGS()
-#undef x
-       NULL
-};
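
bch2_write_flags[] above is generated from the BCH_WRITE_FLAGS() x-macro (defined in io_write_types.h, further down in this diff), so the string table can never drift out of sync with the enum. A self-contained sketch of the x-macro technique using a hypothetical flag list:

#include <stdio.h>

/* One list generates everything; add a flag here and both the enum and
 * the name table pick it up automatically. */
#define MY_FLAGS()	\
	x(sync)		\
	x(cached)	\
	x(submitted)

enum my_flags {
#define x(f)	MY_FLAG_##f,
	MY_FLAGS()
#undef x
	MY_FLAG_NR
};

static const char * const my_flag_names[] = {
#define x(f)	#f,
	MY_FLAGS()
#undef x
	NULL
};

int main(void)
{
	for (int i = 0; i < MY_FLAG_NR; i++)
		printf("%d: %s\n", i, my_flag_names[i]);
	return 0;
}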
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 32);
-
-       prt_printf(out, "pos:\t");
-       bch2_bpos_to_text(out, op->pos);
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "started:\t");
-       bch2_pr_time_units(out, local_clock() - op->start_time);
-       prt_newline(out);
-
-       prt_printf(out, "flags:\t");
-       prt_bitflags(out, bch2_write_flags, op->flags);
-       prt_newline(out);
-
-       prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
-       prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
-
-       prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
-       prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
-
-       printbuf_indent_sub(out, 2);
-}
-
-void bch2_fs_io_write_exit(struct bch_fs *c)
-{
-       bioset_exit(&c->replica_set);
-       bioset_exit(&c->bio_write);
-}
-
-int bch2_fs_io_write_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->bio_write,   1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
-           bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
-               return bch_err_throw(c, ENOMEM_bio_write_init);
-
-       return 0;
-}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
deleted file mode 100644 (file)
index 2c0a8f3..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_H
-#define _BCACHEFS_IO_WRITE_H
-
-#include "checksum.h"
-#include "io_write_types.h"
-
-#define to_wbio(_bio)                  \
-       container_of((_bio), struct bch_write_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
-                              enum bch_data_type, const struct bkey_i *, bool);
-
-__printf(3, 4)
-void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
-       return op->watermark == BCH_WATERMARK_copygc
-               ? op->c->copygc_wq
-               : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
-                              struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
-                      struct btree_iter *, struct bkey_i *,
-                      struct disk_reservation *, u64, s64 *, bool);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                                     struct bch_io_opts opts)
-{
-       op->c                   = c;
-       op->end_io              = NULL;
-       op->flags               = 0;
-       op->written             = 0;
-       op->error               = 0;
-       op->csum_type           = bch2_data_checksum_type(c, opts);
-       op->compression_opt     = opts.compression;
-       op->nr_replicas         = 0;
-       op->nr_replicas_required = c->opts.data_replicas_required;
-       op->watermark           = BCH_WATERMARK_normal;
-       op->incompressible      = 0;
-       op->open_buckets.nr     = 0;
-       op->devs_have.nr        = 0;
-       op->target              = 0;
-       op->opts                = opts;
-       op->subvol              = 0;
-       op->pos                 = POS_MAX;
-       op->version             = ZERO_VERSION;
-       op->write_point         = (struct write_point_specifier) { 0 };
-       op->res                 = (struct disk_reservation) { 0 };
-       op->new_i_size          = U64_MAX;
-       op->i_sectors_delta     = 0;
-       op->devs_need_flush     = NULL;
-}
-
-CLOSURE_CALLBACK(bch2_write);
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
-       struct bch_write_bio *wbio = to_wbio(bio);
-
-       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
-       return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-void bch2_fs_io_write_exit(struct bch_fs *);
-int bch2_fs_io_write_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
deleted file mode 100644 (file)
index 5da4eb8..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_TYPES_H
-#define _BCACHEFS_IO_WRITE_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-#define BCH_WRITE_FLAGS()              \
-       x(alloc_nowait)                 \
-       x(cached)                       \
-       x(data_encoded)                 \
-       x(pages_stable)                 \
-       x(pages_owned)                  \
-       x(only_specified_devs)          \
-       x(wrote_data_inline)            \
-       x(check_enospc)                 \
-       x(sync)                         \
-       x(move)                         \
-       x(in_worker)                    \
-       x(submitted)                    \
-       x(io_error)                     \
-       x(convert_unwritten)
-
-enum __bch_write_flags {
-#define x(f)   __BCH_WRITE_##f,
-       BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f)   BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
-       BCH_WRITE_FLAGS()
-#undef x
-};
-
-struct bch_write_bio {
-       struct_group(wbio,
-       struct bch_fs           *c;
-       struct bch_write_bio    *parent;
-
-       u64                     submit_time;
-       u64                     inode_offset;
-       u64                     nocow_bucket;
-
-       struct bch_devs_list    failed;
-       u8                      dev;
-
-       unsigned                split:1,
-                               bounce:1,
-                               put_bio:1,
-                               have_ioref:1,
-                               nocow:1,
-                               used_mempool:1,
-                               first_btree_write:1;
-       );
-
-       struct bio              bio;
-};
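
struct bch_write_bio keeps the generic struct bio as its last member; to_wbio() (see io_write.h above) recovers the wrapper from the embedded member with container_of(). A standalone sketch of that embedding pattern, with hypothetical types and a userspace container_of():

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bio { int op; };

struct my_write_bio {
	void		*ctx;
	struct bio	bio;	/* embedded last, like bch_write_bio */
};

int main(void)
{
	struct my_write_bio wbio = { .ctx = NULL };
	struct bio *b = &wbio.bio;	/* what generic code passes around */

	/* Recover the wrapper from the embedded member: */
	struct my_write_bio *back = container_of(b, struct my_write_bio, bio);
	printf("%d\n", back == &wbio);	/* prints 1 */
	return 0;
}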
-
-struct bch_write_op {
-       struct closure          cl;
-       struct bch_fs           *c;
-       void                    (*end_io)(struct bch_write_op *);
-       u64                     start_time;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-       unsigned                list_idx;
-#endif
-
-       unsigned                written; /* sectors */
-       u16                     flags;
-       s16                     error; /* dio write path expects it to hold -ERESTARTSYS... */
-
-       unsigned                compression_opt:8;
-       unsigned                csum_type:4;
-       unsigned                nr_replicas:4;
-       unsigned                nr_replicas_required:4;
-       unsigned                watermark:3;
-       unsigned                incompressible:1;
-       unsigned                stripe_waited:1;
-
-       struct bch_devs_list    devs_have;
-       u16                     target;
-       u16                     nonce;
-       struct bch_io_opts      opts;
-
-       u32                     subvol;
-       struct bpos             pos;
-       struct bversion         version;
-
-       /* For BCH_WRITE_data_encoded: */
-       struct bch_extent_crc_unpacked crc;
-
-       struct write_point_specifier write_point;
-
-       struct write_point      *wp;
-       struct list_head        wp_list;
-
-       struct disk_reservation res;
-
-       struct open_buckets     open_buckets;
-
-       u64                     new_i_size;
-       s64                     i_sectors_delta;
-
-       struct bch_devs_mask    failed;
-
-       struct keylist          insert_keys;
-       u64                     inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
-       /*
-        * Bitmask of devices that have had nocow writes issued to them since
-        * last flush:
-        */
-       struct bch_devs_mask    *devs_need_flush;
-
-       /* Must be last: */
-       struct bch_write_bio    wbio;
-};
-
-#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
deleted file mode 100644 (file)
index ddfeb0d..0000000
+++ /dev/null
@@ -1,1832 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs journalling code, for btree insertions
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_methods.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "trace.h"
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
-       return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
-       return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
-       return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
-       return __journal_entry_is_open(j->reservations);
-}
-
-static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
-{
-       union journal_res_state s = READ_ONCE(j->reservations);
-       unsigned i = seq & JOURNAL_BUF_MASK;
-       struct journal_buf *buf = j->buf + i;
-
-       prt_printf(out, "seq:\t%llu\n", seq);
-       printbuf_indent_add(out, 2);
-
-       if (!buf->write_started)
-               prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
-
-       struct closure *cl = &buf->io;
-       int r = atomic_read(&cl->remaining);
-       prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
-
-       if (buf->data) {
-               prt_printf(out, "size:\t");
-               prt_human_readable_u64(out, vstruct_bytes(buf->data));
-               prt_newline(out);
-       }
-
-       prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
-
-       prt_printf(out, "flags:\t");
-       if (buf->noflush)
-               prt_str(out, "noflush ");
-       if (buf->must_flush)
-               prt_str(out, "must_flush ");
-       if (buf->separate_flush)
-               prt_str(out, "separate_flush ");
-       if (buf->need_flush_to_write_buffer)
-               prt_str(out, "need_flush_to_write_buffer ");
-       if (buf->write_started)
-               prt_str(out, "write_started ");
-       if (buf->write_allocated)
-               prt_str(out, "write_allocated ");
-       if (buf->write_done)
-               prt_str(out, "write_done");
-       prt_newline(out);
-
-       printbuf_indent_sub(out, 2);
-}
-
-static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
-{
-       lockdep_assert_held(&j->lock);
-       out->atomic++;
-
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 24);
-
-       for (u64 seq = journal_last_unwritten_seq(j);
-            seq <= journal_cur_seq(j);
-            seq++)
-               bch2_journal_buf_to_text(out, j, seq);
-       prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
-
-       --out->atomic;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
-       struct journal_buf *buf = NULL;
-
-       EBUG_ON(seq > journal_cur_seq(j));
-
-       if (journal_seq_unwritten(j, seq))
-               buf = j->buf + (seq & JOURNAL_BUF_MASK);
-       return buf;
-}
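
journal_seq_to_buf() maps a sequence number onto a small power-of-two ring of buffers with `seq & JOURNAL_BUF_MASK`. A minimal sketch of that indexing, assuming a hypothetical 4-entry ring:

#include <stdio.h>

#define BUF_NR		4u		/* must be a power of two */
#define BUF_MASK	(BUF_NR - 1)

struct buf { unsigned long long seq; };

static struct buf bufs[BUF_NR];

/* The same slot repeats every BUF_NR sequence numbers. */
static struct buf *seq_to_buf(unsigned long long seq)
{
	return &bufs[seq & BUF_MASK];
}

int main(void)
{
	printf("%d\n", seq_to_buf(5) == &bufs[1]);	/* prints 1: 5 & 3 == 1 */
	return 0;
}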
-
-static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
-{
-       for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
-               INIT_LIST_HEAD(&p->unflushed[i]);
-       for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
-               INIT_LIST_HEAD(&p->flushed[i]);
-       atomic_set(&p->count, count);
-       p->devs.nr = 0;
-}
-
-/*
- * Detect stuck journal conditions and trigger shutdown. Technically the journal
- * can end up stuck for a variety of reasons, such as a blocked I/O, journal
- * reservation lockup, etc. Since this is a fatal error with potentially
- * unpredictable characteristics, we want to be fairly conservative before we
- * decide to shut things down.
- *
- * Consider the journal stuck when it appears full with no ability to commit
- * btree transactions, to discard journal buckets, nor acquire priority
- * (reserved watermark) reservation.
- */
-static inline bool
-journal_error_check_stuck(struct journal *j, int error, unsigned flags)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       bool stuck = false;
-       struct printbuf buf = PRINTBUF;
-
-       buf.atomic++;
-
-       if (!(error == -BCH_ERR_journal_full ||
-             error == -BCH_ERR_journal_pin_full) ||
-           nr_unwritten_journal_entries(j) ||
-           (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
-               return stuck;
-
-       spin_lock(&j->lock);
-
-       if (j->can_discard) {
-               spin_unlock(&j->lock);
-               return stuck;
-       }
-
-       stuck = true;
-
-       /*
-        * The journal shutdown path will set ->err_seq, but do it here first to
-        * serialize against concurrent failures and avoid duplicate error
-        * reports.
-        */
-       if (j->err_seq) {
-               spin_unlock(&j->lock);
-               return stuck;
-       }
-       j->err_seq = journal_cur_seq(j);
-
-       __bch2_journal_debug_to_text(&buf, j);
-       spin_unlock(&j->lock);
-       prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"),
-                                 bch2_err_str(error));
-       bch2_print_str(c, KERN_ERR, buf.buf);
-
-       printbuf_reset(&buf);
-       bch2_journal_pins_to_text(&buf, j);
-       bch_err(c, "Journal pins:\n%s", buf.buf);
-       printbuf_exit(&buf);
-
-       bch2_fatal_error(c);
-       dump_stack();
-
-       return stuck;
-}
-
-void bch2_journal_do_writes(struct journal *j)
-{
-       for (u64 seq = journal_last_unwritten_seq(j);
-            seq <= journal_cur_seq(j);
-            seq++) {
-               unsigned idx = seq & JOURNAL_BUF_MASK;
-               struct journal_buf *w = j->buf + idx;
-
-               if (w->write_started && !w->write_allocated)
-                       break;
-               if (w->write_started)
-                       continue;
-
-               if (!journal_state_seq_count(j, j->reservations, seq)) {
-                       j->seq_write_started = seq;
-                       w->write_started = true;
-                       closure_call(&w->io, bch2_journal_write, j->wq, NULL);
-               }
-
-               break;
-       }
-}
-
-/*
- * Final processing when the last reference of a journal buffer has been
- * dropped. Drop the pin list reference acquired at journal entry open and write
- * the buffer, if requested.
- */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq)
-{
-       lockdep_assert_held(&j->lock);
-
-       if (__bch2_journal_pin_put(j, seq))
-               bch2_journal_reclaim_fast(j);
-       bch2_journal_do_writes(j);
-
-       /*
-        * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
-        * open journal entry
-        */
-       wake_up(&j->wait);
-}
-
-/*
- * Close the currently-open journal entry:
- *
- * We don't close a journal_buf until the next journal_buf is finished writing,
- * and can be opened again - this also initializes the next journal_buf:
- */
-static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf = journal_cur_buf(j);
-       union journal_res_state old, new;
-       unsigned sectors;
-
-       BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
-              closed_val != JOURNAL_ENTRY_ERROR_VAL);
-
-       lockdep_assert_held(&j->lock);
-
-       old.v = atomic64_read(&j->reservations.counter);
-       do {
-               new.v = old.v;
-               new.cur_entry_offset = closed_val;
-
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
-                   old.cur_entry_offset == new.cur_entry_offset)
-                       return;
-       } while (!atomic64_try_cmpxchg(&j->reservations.counter,
-                                      &old.v, new.v));
-
-       if (!__journal_entry_is_open(old))
-               return;
-
-       if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
-               old.cur_entry_offset = j->cur_entry_offset_if_blocked;
-
-       /* Close out old buffer: */
-       buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
-
-       if (trace_journal_entry_close_enabled() && trace) {
-               struct printbuf pbuf = PRINTBUF;
-               pbuf.atomic++;
-
-               prt_str(&pbuf, "entry size: ");
-               prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
-               prt_newline(&pbuf);
-               bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
-               trace_journal_entry_close(c, pbuf.buf);
-               printbuf_exit(&pbuf);
-       }
-
-       sectors = vstruct_blocks_plus(buf->data, c->block_bits,
-                                     buf->u64s_reserved) << c->block_bits;
-       if (unlikely(sectors > buf->sectors)) {
-               struct printbuf err = PRINTBUF;
-               err.atomic++;
-
-               prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
-                          sectors, buf->sectors);
-               prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
-                          le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
-                          j->cur_entry_u64s,
-                          c->block_bits);
-               prt_printf(&err, "fatal error - emergency read only");
-               bch2_journal_halt_locked(j);
-
-               bch_err(c, "%s", err.buf);
-               printbuf_exit(&err);
-               return;
-       }
-
-       buf->sectors = sectors;
-
-       /*
-        * We have to set last_seq here, _before_ opening a new journal entry:
-        *
-        * A thread may replace an old pin with a new pin on its current
-        * journal reservation - the expectation being that the journal will
-        * contain either what the old pin protected or what the new pin
-        * protects.
-        *
-        * After the old pin is dropped journal_last_seq() won't include the old
-        * pin, so we can only write the updated last_seq on the entry that
-        * contains whatever the new pin protects.
-        *
-        * Restated, we can _not_ update last_seq for a given entry if there
-        * could be a newer entry open with reservations/pins that have been
-        * taken against it.
-        *
-        * Hence, we want to update/set last_seq on the current journal entry right
-        * before we open a new one:
-        */
-       buf->last_seq           = journal_last_seq(j);
-       buf->data->last_seq     = cpu_to_le64(buf->last_seq);
-       BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
-
-       cancel_delayed_work(&j->write_work);
-
-       bch2_journal_space_available(j);
-
-       __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
-}
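
The close path updates the packed reservation word with an atomic compare-and-exchange loop: compute the new state from a snapshot, retry if another CPU changed the word in the meantime. A portable C11 sketch of the same pattern (the union layout and names are illustrative; the kernel code uses atomic64_try_cmpxchg()):

#include <stdatomic.h>
#include <stdint.h>

/* Packed state word, in the spirit of union journal_res_state. */
union res_state {
	uint64_t v;
	struct {
		uint32_t	cur_entry_offset;
		uint32_t	idx;
	};
};

static void close_entry(_Atomic uint64_t *counter, uint32_t closed_val)
{
	union res_state old, new;

	old.v = atomic_load(counter);
	do {
		new.v = old.v;
		new.cur_entry_offset = closed_val;

		if (old.cur_entry_offset == closed_val)
			return;		/* already closed: nothing to do */
		/* on failure, old.v is refreshed and the loop recomputes */
	} while (!atomic_compare_exchange_weak(counter, &old.v, new.v));
}

int main(void)
{
	_Atomic uint64_t counter = 0;

	close_entry(&counter, 7);
	return 0;
}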
-
-void bch2_journal_halt_locked(struct journal *j)
-{
-       lockdep_assert_held(&j->lock);
-
-       __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
-       if (!j->err_seq)
-               j->err_seq = journal_cur_seq(j);
-       journal_wake(j);
-}
-
-void bch2_journal_halt(struct journal *j)
-{
-       spin_lock(&j->lock);
-       bch2_journal_halt_locked(j);
-       spin_unlock(&j->lock);
-}
-
-static bool journal_entry_want_write(struct journal *j)
-{
-       bool ret = !journal_entry_is_open(j) ||
-               journal_cur_seq(j) == journal_last_unwritten_seq(j);
-
-       /* Don't close it yet if we already have a write in flight: */
-       if (ret)
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-       else if (nr_unwritten_journal_entries(j)) {
-               struct journal_buf *buf = journal_cur_buf(j);
-
-               if (!buf->flush_time) {
-                       buf->flush_time = local_clock() ?: 1;
-                       buf->expires = jiffies;
-               }
-       }
-
-       return ret;
-}
-
-bool bch2_journal_entry_close(struct journal *j)
-{
-       bool ret;
-
-       spin_lock(&j->lock);
-       ret = journal_entry_want_write(j);
-       spin_unlock(&j->lock);
-
-       return ret;
-}
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - journal entry is open means journal is dirty:
- */
-static int journal_entry_open(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf = j->buf +
-               ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
-       union journal_res_state old, new;
-       int u64s;
-
-       lockdep_assert_held(&j->lock);
-       BUG_ON(journal_entry_is_open(j));
-       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
-       if (j->blocked)
-               return bch_err_throw(c, journal_blocked);
-
-       if (j->cur_entry_error)
-               return j->cur_entry_error;
-
-       int ret = bch2_journal_error(j);
-       if (unlikely(ret))
-               return ret;
-
-       if (!fifo_free(&j->pin))
-               return bch_err_throw(c, journal_pin_full);
-
-       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
-               return bch_err_throw(c, journal_max_in_flight);
-
-       if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
-               return bch_err_throw(c, journal_max_open);
-
-       if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
-               bch_err(c, "cannot start: journal seq overflow");
-               if (bch2_fs_emergency_read_only_locked(c))
-                       bch_err(c, "fatal error - emergency read only");
-               return bch_err_throw(c, journal_shutdown);
-       }
-
-       if (!j->free_buf && !buf->data)
-               return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
-
-       BUG_ON(!j->cur_entry_sectors);
-
-       if (!buf->data) {
-               swap(buf->data,         j->free_buf);
-               swap(buf->buf_size,     j->free_buf_size);
-       }
-
-       buf->expires            =
-               (journal_cur_seq(j) == j->flushed_seq_ondisk
-                ? jiffies
-                : j->last_flush_write) +
-               msecs_to_jiffies(c->opts.journal_flush_delay);
-
-       buf->u64s_reserved      = j->entry_u64s_reserved;
-       buf->disk_sectors       = j->cur_entry_sectors;
-       buf->sectors            = min(buf->disk_sectors, buf->buf_size >> 9);
-
-       u64s = (int) (buf->sectors << 9) / sizeof(u64) -
-               journal_entry_overhead(j);
-       u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
-
-       if (u64s <= (ssize_t) j->early_journal_entries.nr)
-               return bch_err_throw(c, journal_full);
-
-       if (fifo_empty(&j->pin) && j->reclaim_thread)
-               wake_up_process(j->reclaim_thread);
-
-       /*
-        * The fifo_push() needs to happen at the same time as j->seq is
-        * incremented for journal_last_seq() to be calculated correctly
-        */
-       atomic64_inc(&j->seq);
-       journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-
-       if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) {
-               bch_err(c, "attempting to open blacklisted journal seq %llu",
-                       journal_cur_seq(j));
-               if (bch2_fs_emergency_read_only_locked(c))
-                       bch_err(c, "fatal error - emergency read only");
-               return bch_err_throw(c, journal_shutdown);
-       }
-
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
-       BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
-
-       bkey_extent_init(&buf->key);
-       buf->noflush            = false;
-       buf->must_flush         = false;
-       buf->separate_flush     = false;
-       buf->flush_time         = 0;
-       buf->need_flush_to_write_buffer = true;
-       buf->write_started      = false;
-       buf->write_allocated    = false;
-       buf->write_done         = false;
-
-       memset(buf->data, 0, sizeof(*buf->data));
-       buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
-       buf->data->u64s = 0;
-
-       if (j->early_journal_entries.nr) {
-               memcpy(buf->data->_data, j->early_journal_entries.data,
-                      j->early_journal_entries.nr * sizeof(u64));
-               le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
-       }
-
-       /*
-        * Must be set before marking the journal entry as open:
-        */
-       j->cur_entry_u64s = u64s;
-
-       old.v = atomic64_read(&j->reservations.counter);
-       do {
-               new.v = old.v;
-
-               BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
-
-               new.idx++;
-               BUG_ON(journal_state_count(new, new.idx));
-               BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
-
-               journal_state_inc(&new);
-
-               /* Handle any already added entries */
-               new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
-       } while (!atomic64_try_cmpxchg(&j->reservations.counter,
-                                      &old.v, new.v));
-
-       if (nr_unwritten_journal_entries(j) == 1)
-               mod_delayed_work(j->wq,
-                                &j->write_work,
-                                msecs_to_jiffies(c->opts.journal_flush_delay));
-       journal_wake(j);
-
-       if (j->early_journal_entries.nr)
-               darray_exit(&j->early_journal_entries);
-       return 0;
-}
-
-static bool journal_quiesced(struct journal *j)
-{
-       bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
-
-       if (!ret)
-               bch2_journal_entry_close(j);
-       return ret;
-}
-
-static void journal_quiesce(struct journal *j)
-{
-       wait_event(j->wait, journal_quiesced(j));
-}
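
journal_quiesced() is a wait predicate with a side effect: every evaluation that fails also nudges the journal forward by closing any open entry, so the waiter never stalls on an entry nobody else would close. A pthread-based sketch of that wait-with-nudge pattern, with hypothetical names:

#include <pthread.h>
#include <stdbool.h>

struct jrnl {
	pthread_mutex_t		lock;
	pthread_cond_t		wait;
	unsigned long long	seq, seq_ondisk;
	bool			entry_open;
};

/* Predicate with a side effect: if not yet quiesced, kick the system
 * forward (a stand-in for bch2_journal_entry_close()). */
static bool quiesced(struct jrnl *j)
{
	bool ret = j->seq == j->seq_ondisk;

	if (!ret)
		j->entry_open = false;
	return ret;
}

static void quiesce(struct jrnl *j)
{
	pthread_mutex_lock(&j->lock);
	while (!quiesced(j))
		pthread_cond_wait(&j->wait, &j->lock);
	pthread_mutex_unlock(&j->lock);
}

int main(void)
{
	struct jrnl j = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
		.seq = 1, .seq_ondisk = 1,
	};

	quiesce(&j);	/* returns immediately: already quiesced */
	return 0;
}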
-
-static void journal_write_work(struct work_struct *work)
-{
-       struct journal *j = container_of(work, struct journal, write_work.work);
-
-       spin_lock(&j->lock);
-       if (__journal_entry_is_open(j->reservations)) {
-               long delta = journal_cur_buf(j)->expires - jiffies;
-
-               if (delta > 0)
-                       mod_delayed_work(j->wq, &j->write_work, delta);
-               else
-                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-       }
-       spin_unlock(&j->lock);
-}
-
-static void journal_buf_prealloc(struct journal *j)
-{
-       if (j->free_buf &&
-           j->free_buf_size >= j->buf_size_want)
-               return;
-
-       unsigned buf_size = j->buf_size_want;
-
-       spin_unlock(&j->lock);
-       void *buf = kvmalloc(buf_size, GFP_NOFS);
-       spin_lock(&j->lock);
-
-       if (buf &&
-           (!j->free_buf ||
-            buf_size > j->free_buf_size)) {
-               swap(buf,       j->free_buf);
-               swap(buf_size,  j->free_buf_size);
-       }
-
-       if (unlikely(buf)) {
-               spin_unlock(&j->lock);
-               /* kvfree can sleep */
-               kvfree(buf);
-               spin_lock(&j->lock);
-       }
-}
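
journal_buf_prealloc() shows a classic lock-friendly allocation dance: drop the lock to allocate, retake it, install the buffer only if it is still the best candidate, and free the loser outside the lock. A pthread sketch of the install-or-discard pattern (hypothetical names; mutex where the kernel uses a spinlock):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *free_buf;
static size_t free_buf_size;

/* Ensure a preallocated buffer of at least want bytes is available. */
static void buf_prealloc(size_t want)
{
	pthread_mutex_lock(&lock);
	if (free_buf && free_buf_size >= want) {
		pthread_mutex_unlock(&lock);
		return;
	}
	pthread_mutex_unlock(&lock);		/* never allocate under the lock */

	void *buf = malloc(want);
	size_t size = want;

	pthread_mutex_lock(&lock);
	if (buf && (!free_buf || size > free_buf_size)) {
		/* Install ours; the old buffer (if any) becomes the loser. */
		void *tb = free_buf;  free_buf = buf;  buf = tb;
		size_t ts = free_buf_size;  free_buf_size = size;  size = ts;
	}
	pthread_mutex_unlock(&lock);

	free(buf);				/* free the loser outside the lock */
}

int main(void)
{
	buf_prealloc(4096);
	free(free_buf);
	return 0;
}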
-
-static int __journal_res_get(struct journal *j, struct journal_res *res,
-                            unsigned flags)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf;
-       bool can_discard;
-       int ret;
-retry:
-       if (journal_res_get_fast(j, res, flags))
-               return 0;
-
-       ret = bch2_journal_error(j);
-       if (unlikely(ret))
-               return ret;
-
-       if (j->blocked)
-               return bch_err_throw(c, journal_blocked);
-
-       if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
-               ret = bch_err_throw(c, journal_full);
-               can_discard = j->can_discard;
-               goto out;
-       }
-
-       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
-               ret = bch_err_throw(c, journal_max_in_flight);
-               goto out;
-       }
-
-       spin_lock(&j->lock);
-
-       journal_buf_prealloc(j);
-
-       /*
-        * Recheck after taking the lock, so we don't race with another thread
-        * that just did journal_entry_open(), and end up calling
-        * bch2_journal_entry_close() unnecessarily
-        */
-       if (journal_res_get_fast(j, res, flags)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       /*
-        * If we couldn't get a reservation because the current buf filled up,
-        * and we had room for a bigger entry on disk, signal that we want to
-        * realloc the journal bufs:
-        */
-       buf = journal_cur_buf(j);
-       if (journal_entry_is_open(j) &&
-           buf->buf_size >> 9 < buf->disk_sectors &&
-           buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
-               j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
-
-       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
-       ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
-unlock:
-       can_discard = j->can_discard;
-       spin_unlock(&j->lock);
-out:
-       if (likely(!ret))
-               return 0;
-       if (ret == -BCH_ERR_journal_retry_open)
-               goto retry;
-
-       if (journal_error_check_stuck(j, ret, flags))
-               ret = bch_err_throw(c, journal_stuck);
-
-       if (ret == -BCH_ERR_journal_max_in_flight &&
-           track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
-           trace_journal_entry_full_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_printbuf_make_room(&buf, 4096);
-
-               spin_lock(&j->lock);
-               prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
-               bch2_journal_bufs_to_text(&buf, j);
-               spin_unlock(&j->lock);
-
-               trace_journal_entry_full(c, buf.buf);
-               printbuf_exit(&buf);
-               count_event(c, journal_entry_full);
-       }
-
-       if (ret == -BCH_ERR_journal_max_open &&
-           track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
-           trace_journal_entry_full_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_printbuf_make_room(&buf, 4096);
-
-               spin_lock(&j->lock);
-               prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
-               bch2_journal_bufs_to_text(&buf, j);
-               spin_unlock(&j->lock);
-
-               trace_journal_entry_full(c, buf.buf);
-               printbuf_exit(&buf);
-               count_event(c, journal_entry_full);
-       }
-
-       /*
-        * Journal is full - can't rely on reclaim from work item due to
-        * freezing:
-        */
-       if ((ret == -BCH_ERR_journal_full ||
-            ret == -BCH_ERR_journal_pin_full) &&
-           !(flags & JOURNAL_RES_GET_NONBLOCK)) {
-               if (can_discard) {
-                       bch2_journal_do_discards(j);
-                       goto retry;
-               }
-
-               if (mutex_trylock(&j->reclaim_lock)) {
-                       bch2_journal_reclaim(j);
-                       mutex_unlock(&j->reclaim_lock);
-               }
-       }
-
-       return ret;
-}
-
-static unsigned max_dev_latency(struct bch_fs *c)
-{
-       u64 nsecs = 0;
-
-       guard(rcu)();
-       for_each_rw_member_rcu(c, ca)
-               nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
-
-       return nsecs_to_jiffies(nsecs);
-}
-
-/*
- * Essentially the entry function to the journaling code. When bcachefs is doing
- * a btree insert, it calls this function to get the current journal write.
- * The journal write is the structure used to set up journal writes. The calling
- * function will then add its keys to the structure, queuing them for the next
- * write.
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks.
- */
-int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
-                                 unsigned flags,
-                                 struct btree_trans *trans)
-{
-       int ret;
-
-       if (closure_wait_event_timeout(&j->async_wait,
-                  !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
-                  (flags & JOURNAL_RES_GET_NONBLOCK),
-                  HZ))
-               return ret;
-
-       if (trans)
-               bch2_trans_unlock_long(trans);
-
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
-
-       remaining_wait = max(0, remaining_wait - HZ);
-
-       if (closure_wait_event_timeout(&j->async_wait,
-                  !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
-                  (flags & JOURNAL_RES_GET_NONBLOCK),
-                  remaining_wait))
-               return ret;
-
-       struct printbuf buf = PRINTBUF;
-       bch2_journal_debug_to_text(&buf, j);
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
-       printbuf_exit(&buf);
-
-       closure_wait_event(&j->async_wait,
-                  !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
-                  (flags & JOURNAL_RES_GET_NONBLOCK));
-       return ret;
-}
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *j,
-                                  struct journal_entry_res *res,
-                                  unsigned new_u64s)
-{
-       union journal_res_state state;
-       int d = new_u64s - res->u64s;
-
-       spin_lock(&j->lock);
-
-       j->entry_u64s_reserved += d;
-       if (d <= 0)
-               goto out;
-
-       j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
-       state = READ_ONCE(j->reservations);
-
-       if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
-           state.cur_entry_offset > j->cur_entry_u64s) {
-               j->cur_entry_u64s += d;
-               /*
-                * Not enough room in current journal entry, have to flush it:
-                */
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-       } else {
-               journal_cur_buf(j)->u64s_reserved += d;
-       }
-out:
-       spin_unlock(&j->lock);
-       res->u64s += d;
-}
-
-/* journal flushing: */
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- * @j:         journal object
- * @seq:       seq to flush
- * @parent:    closure object to wait with
- * Returns:    1 if @seq has already been flushed, 0 if @seq is being flushed,
- *             -BCH_ERR_journal_flush_err if @seq will never be flushed
- *
- * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
-                                struct closure *parent)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf;
-       int ret = 0;
-
-       if (seq <= j->flushed_seq_ondisk)
-               return 1;
-
-       spin_lock(&j->lock);
-
-       if (WARN_ONCE(seq > journal_cur_seq(j),
-                     "requested to flush journal seq %llu, but currently at %llu",
-                     seq, journal_cur_seq(j)))
-               goto out;
-
-       /* Recheck under lock: */
-       if (j->err_seq && seq >= j->err_seq) {
-               ret = bch_err_throw(c, journal_flush_err);
-               goto out;
-       }
-
-       if (seq <= j->flushed_seq_ondisk) {
-               ret = 1;
-               goto out;
-       }
-
-       /* if seq was written, but not flushed - flush a newer one instead */
-       seq = max(seq, journal_last_unwritten_seq(j));
-
-recheck_need_open:
-       if (seq > journal_cur_seq(j)) {
-               struct journal_res res = { 0 };
-
-               if (journal_entry_is_open(j))
-                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
-               spin_unlock(&j->lock);
-
-               /*
-                * We're called from bch2_journal_flush_seq() -> wait_event();
-                * but this might block. We won't usually block, so we won't
-                * livelock:
-                */
-               sched_annotate_sleep();
-               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
-               if (ret)
-                       return ret;
-
-               seq = res.seq;
-               buf = journal_seq_to_buf(j, seq);
-               buf->must_flush = true;
-
-               if (!buf->flush_time) {
-                       buf->flush_time = local_clock() ?: 1;
-                       buf->expires = jiffies;
-               }
-
-               if (parent && !closure_wait(&buf->wait, parent))
-                       BUG();
-
-               bch2_journal_res_put(j, &res);
-
-               spin_lock(&j->lock);
-               goto want_write;
-       }
-
-       /*
-        * if write was kicked off without a flush, or if we promised it
-        * wouldn't be a flush, flush the next sequence number instead
-        */
-       buf = journal_seq_to_buf(j, seq);
-       if (buf->noflush) {
-               seq++;
-               goto recheck_need_open;
-       }
-
-       buf->must_flush = true;
-       j->flushing_seq = max(j->flushing_seq, seq);
-
-       if (parent && !closure_wait(&buf->wait, parent))
-               BUG();
-want_write:
-       if (seq == journal_cur_seq(j))
-               journal_entry_want_write(j);
-out:
-       spin_unlock(&j->lock);
-       return ret;
-}
-
-int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
-{
-       u64 start_time = local_clock();
-       int ret, ret2;
-
-       /*
-        * Don't update time_stats when @seq is already flushed:
-        */
-       if (seq <= j->flushed_seq_ondisk)
-               return 0;
-
-       ret = wait_event_state(j->wait,
-                              (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
-                              task_state);
-
-       if (!ret)
-               bch2_time_stats_update(j->flush_seq_time, start_time);
-
-       return ret ?: ret2 < 0 ? ret2 : 0;
-}
-
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * still being written, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
-{
-       bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
-}
-
-int bch2_journal_flush(struct journal *j)
-{
-       return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
- * range [start, end)
- * @j:         journal object
- * @start:     start of seq range
- * @end:       end of seq range (exclusive)
- */
-bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       u64 unwritten_seq;
-       bool ret = false;
-
-       if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
-               return false;
-
-       if (c->journal.flushed_seq_ondisk >= start)
-               return false;
-
-       spin_lock(&j->lock);
-       if (c->journal.flushed_seq_ondisk >= start)
-               goto out;
-
-       for (unwritten_seq = journal_last_unwritten_seq(j);
-            unwritten_seq < end;
-            unwritten_seq++) {
-               struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
-
-               /* journal flush already in flight, or flush requested */
-               if (buf->must_flush)
-                       goto out;
-
-               buf->noflush = true;
-       }
-
-       ret = true;
-out:
-       spin_unlock(&j->lock);
-       return ret;
-}
-
-static int __bch2_journal_meta(struct journal *j)
-{
-       struct journal_res res = {};
-       int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
-       if (ret)
-               return ret;
-
-       struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
-       buf->must_flush = true;
-
-       if (!buf->flush_time) {
-               buf->flush_time = local_clock() ?: 1;
-               buf->expires = jiffies;
-       }
-
-       bch2_journal_res_put(j, &res);
-
-       return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
-}
-
-int bch2_journal_meta(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
-               return bch_err_throw(c, erofs_no_writes);
-
-       int ret = __bch2_journal_meta(j);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
-       return ret;
-}
-
-/* block/unlock the journal: */
-
-void bch2_journal_unblock(struct journal *j)
-{
-       spin_lock(&j->lock);
-       if (!--j->blocked &&
-           j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
-           j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
-               union journal_res_state old, new;
-
-               old.v = atomic64_read(&j->reservations.counter);
-               do {
-                       new.v = old.v;
-                       new.cur_entry_offset = j->cur_entry_offset_if_blocked;
-               } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
-       }
-       spin_unlock(&j->lock);
-
-       journal_wake(j);
-}
-
-static void __bch2_journal_block(struct journal *j)
-{
-       if (!j->blocked++) {
-               union journal_res_state old, new;
-
-               old.v = atomic64_read(&j->reservations.counter);
-               do {
-                       j->cur_entry_offset_if_blocked = old.cur_entry_offset;
-
-                       if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
-                               break;
-
-                       new.v = old.v;
-                       new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
-               } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
-
-               if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
-                       journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
-       }
-}
-
-void bch2_journal_block(struct journal *j)
-{
-       spin_lock(&j->lock);
-       __bch2_journal_block(j);
-       spin_unlock(&j->lock);
-
-       journal_quiesce(j);
-}
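
bch2_journal_block() and bch2_journal_unblock() bracket sections that must not race with new journal reservations; bch2_set_nr_journal_buckets_iter() below uses exactly this pattern. A minimal sketch, assuming a struct bch_fs *c as elsewhere in this file:

    bch2_journal_block(&c->journal);    /* new reservations wait; in-flight
                                         * writes have quiesced */
    /* ... safely reconfigure journal state here ... */
    bch2_journal_unblock(&c->journal);  /* reopen the current entry and wake
                                         * any waiters */
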
-
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
-                                               u64 max_seq, bool *blocked)
-{
-       struct journal_buf *ret = NULL;
-
-       /* We're inside wait_event(), but using mutex_lock(): */
-       sched_annotate_sleep();
-       mutex_lock(&j->buf_lock);
-       spin_lock(&j->lock);
-       max_seq = min(max_seq, journal_cur_seq(j));
-
-       for (u64 seq = journal_last_unwritten_seq(j);
-            seq <= max_seq;
-            seq++) {
-               unsigned idx = seq & JOURNAL_BUF_MASK;
-               struct journal_buf *buf = j->buf + idx;
-
-               if (buf->need_flush_to_write_buffer) {
-                       union journal_res_state s;
-                       s.v = atomic64_read_acquire(&j->reservations.counter);
-
-                       unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
-
-                       if (open && !*blocked) {
-                               __bch2_journal_block(j);
-                               s.v = atomic64_read_acquire(&j->reservations.counter);
-                               *blocked = true;
-                       }
-
-                       ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
-                               ? ERR_PTR(-EAGAIN)
-                               : buf;
-                       break;
-               }
-       }
-
-       spin_unlock(&j->lock);
-       if (IS_ERR_OR_NULL(ret))
-               mutex_unlock(&j->buf_lock);
-       return ret;
-}
-
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
-                                                            u64 max_seq, bool *blocked)
-{
-       struct journal_buf *ret;
-       *blocked = false;
-
-       wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
-                                               max_seq, blocked)) != ERR_PTR(-EAGAIN));
-       if (IS_ERR_OR_NULL(ret) && *blocked)
-               bch2_journal_unblock(j);
-
-       return ret;
-}
-
-/* allocate journal on a device: */
-
-static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
-                                           bool new_fs, struct closure *cl)
-{
-       struct bch_fs *c = ca->fs;
-       struct journal_device *ja = &ca->journal;
-       u64 *new_bucket_seq = NULL, *new_buckets = NULL;
-       struct open_bucket **ob = NULL;
-       long *bu = NULL;
-       unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
-       int ret = 0;
-
-       BUG_ON(nr <= ja->nr);
-
-       bu              = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
-       ob              = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
-       new_buckets     = kcalloc(nr, sizeof(u64), GFP_KERNEL);
-       new_bucket_seq  = kcalloc(nr, sizeof(u64), GFP_KERNEL);
-       if (!bu || !ob || !new_buckets || !new_bucket_seq) {
-               ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
-               goto err_free;
-       }
-
-       for (nr_got = 0; nr_got < nr_want; nr_got++) {
-               enum bch_watermark watermark = new_fs
-                       ? BCH_WATERMARK_btree
-                       : BCH_WATERMARK_normal;
-
-               ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
-                                              BCH_DATA_journal, cl);
-               ret = PTR_ERR_OR_ZERO(ob[nr_got]);
-               if (ret)
-                       break;
-
-               if (!new_fs) {
-                       ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(trans, ca,
-                                               ob[nr_got]->bucket, BCH_DATA_journal,
-                                               ca->mi.bucket_size, BTREE_TRIGGER_transactional));
-                       if (ret) {
-                               bch2_open_bucket_put(c, ob[nr_got]);
-                               bch_err_msg(c, ret, "marking new journal buckets");
-                               break;
-                       }
-               }
-
-               bu[nr_got] = ob[nr_got]->bucket;
-       }
-
-       if (!nr_got)
-               goto err_free;
-
-       /* Don't return an error if we successfully allocated some buckets: */
-       ret = 0;
-
-       if (c) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_block(&c->journal);
-               mutex_lock(&c->sb_lock);
-       }
-
-       memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
-       memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
-
-       BUG_ON(ja->discard_idx > ja->nr);
-
-       pos = ja->discard_idx ?: ja->nr;
-
-       memmove(new_buckets + pos + nr_got,
-               new_buckets + pos,
-               sizeof(new_buckets[0]) * (ja->nr - pos));
-       memmove(new_bucket_seq + pos + nr_got,
-               new_bucket_seq + pos,
-               sizeof(new_bucket_seq[0]) * (ja->nr - pos));
-
-       for (i = 0; i < nr_got; i++) {
-               new_buckets[pos + i] = bu[i];
-               new_bucket_seq[pos + i] = 0;
-       }
-
-       nr = ja->nr + nr_got;
-
-       ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
-       if (ret)
-               goto err_unblock;
-
-       bch2_write_super(c);
-
-       /* Commit: */
-       if (c)
-               spin_lock(&c->journal.lock);
-
-       swap(new_buckets,       ja->buckets);
-       swap(new_bucket_seq,    ja->bucket_seq);
-       ja->nr = nr;
-
-       if (pos <= ja->discard_idx)
-               ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
-       if (pos <= ja->dirty_idx_ondisk)
-               ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
-       if (pos <= ja->dirty_idx)
-               ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
-       if (pos <= ja->cur_idx)
-               ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
-
-       if (c)
-               spin_unlock(&c->journal.lock);
-err_unblock:
-       if (c) {
-               bch2_journal_unblock(&c->journal);
-               mutex_unlock(&c->sb_lock);
-       }
-
-       if (ret && !new_fs)
-               for (i = 0; i < nr_got; i++)
-                       bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(trans, ca,
-                                               bu[i], BCH_DATA_free, 0,
-                                               BTREE_TRIGGER_transactional));
-err_free:
-       for (i = 0; i < nr_got; i++)
-               bch2_open_bucket_put(c, ob[i]);
-
-       kfree(new_bucket_seq);
-       kfree(new_buckets);
-       kfree(ob);
-       kfree(bu);
-       return ret;
-}
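
The splice above (insert the newly allocated buckets at the discard position, then rotate every ring index that sits at or past the insertion point) is easy to get wrong. A standalone userspace sketch of just that index arithmetic, with made-up sizes:

    #include <stdio.h>
    #include <string.h>

    /* Toy model of the bucket-ring splice: insert nr_got entries at pos and
     * shift any ring index at or past pos, modulo the new ring size. */
    static void splice(unsigned long long *buckets, unsigned nr, unsigned pos,
                       const unsigned long long *new_bu, unsigned nr_got,
                       unsigned *cur_idx)
    {
            memmove(buckets + pos + nr_got, buckets + pos,
                    sizeof(*buckets) * (nr - pos));
            memcpy(buckets + pos, new_bu, sizeof(*new_bu) * nr_got);

            if (pos <= *cur_idx)
                    *cur_idx = (*cur_idx + nr_got) % (nr + nr_got);
    }

    int main(void)
    {
            unsigned long long buckets[8] = { 10, 11, 12, 13 };
            unsigned long long extra[2]   = { 20, 21 };
            unsigned cur_idx = 2;           /* currently writing to bucket 12 */

            splice(buckets, 4, 1, extra, 2, &cur_idx);

            for (unsigned i = 0; i < 6; i++)
                    printf("%llu ", buckets[i]);
            printf("cur_idx=%u\n", cur_idx); /* 10 20 21 11 12 13 cur_idx=4 */
            return 0;
    }

After the splice, cur_idx still refers to bucket 12, which is the point of the index adjustment.
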
-
-static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
-                                           unsigned nr, bool new_fs)
-{
-       struct journal_device *ja = &ca->journal;
-       int ret = 0;
-
-       struct closure cl;
-       closure_init_stack(&cl);
-
-       /* don't handle reducing nr of buckets yet: */
-       if (nr < ja->nr)
-               return 0;
-
-       while (!ret && ja->nr < nr) {
-               struct disk_reservation disk_res = { 0, 0, 0 };
-
-               /*
-                * note: journal buckets aren't really counted as _sectors_ used yet, so
-                * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
-                * when space used goes up without a reservation - but we do need the
-                * reservation to ensure we'll actually be able to allocate:
-                *
-                * XXX: that's not right, disk reservations only ensure a
-                * filesystem-wide allocation will succeed, this is a device
-                * specific allocation - we can hang here:
-                */
-               if (!new_fs) {
-                       ret = bch2_disk_reservation_get(c, &disk_res,
-                                                       bucket_to_sector(ca, nr - ja->nr), 1, 0);
-                       if (ret)
-                               break;
-               }
-
-               ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
-
-               if (ret == -BCH_ERR_bucket_alloc_blocked ||
-                   ret == -BCH_ERR_open_buckets_empty)
-                       ret = 0; /* wait and retry */
-
-               bch2_disk_reservation_put(c, &disk_res);
-               bch2_wait_on_allocator(c, &cl);
-       }
-
-       return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
-                               unsigned nr)
-{
-       down_write(&c->state_lock);
-       int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
-       up_write(&c->state_lock);
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
-{
-       struct bch_fs *c = ca->fs;
-       struct journal *j = &c->journal;
-       struct journal_device *ja = &ca->journal;
-
-       guard(mutex)(&c->sb_lock);
-       unsigned pos;
-       for (pos = 0; pos < ja->nr; pos++)
-               if (ja->buckets[pos] == b)
-                       break;
-
-       if (pos == ja->nr) {
-               bch_err(ca, "journal bucket %llu not found when deleting", b);
-               return -EINVAL;
-       }
-
-       u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
-       if (!new_buckets)
-               return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
-
-       memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
-       memmove(&new_buckets[pos],
-               &new_buckets[pos + 1],
-               (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
-
-       int ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, ja->nr - 1) ?:
-               bch2_write_super(c);
-       if (ret) {
-               kfree(new_buckets);
-               return ret;
-       }
-
-       scoped_guard(spinlock, &j->lock) {
-               if (pos < ja->discard_idx)
-                       --ja->discard_idx;
-               if (pos < ja->dirty_idx_ondisk)
-                       --ja->dirty_idx_ondisk;
-               if (pos < ja->dirty_idx)
-                       --ja->dirty_idx;
-               if (pos < ja->cur_idx)
-                       --ja->cur_idx;
-
-               ja->nr--;
-
-               memmove(&ja->buckets[pos],
-                       &ja->buckets[pos + 1],
-                       (ja->nr - pos) * sizeof(ja->buckets[0]));
-
-               memmove(&ja->bucket_seq[pos],
-                       &ja->bucket_seq[pos + 1],
-                       (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
-
-               bch2_journal_space_available(j);
-       }
-
-       kfree(new_buckets);
-       return 0;
-}
-
-int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
-{
-       struct bch_fs *c = ca->fs;
-
-       if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
-               return 0;
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
-               bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
-               return bch_err_throw(c, erofs_filesystem_full);
-       }
-
-       unsigned nr;
-       int ret;
-
-       if (dynamic_fault("bcachefs:add:journal_alloc")) {
-               ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
-               goto err;
-       }
-
-       /* 1/128th of the device by default: */
-       nr = ca->mi.nbuckets >> 7;
-
-       /*
-        * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
-        * is smaller:
-        */
-       nr = clamp_t(unsigned, nr,
-                    BCH_JOURNAL_BUCKETS_MIN,
-                    min(1 << 13,
-                        (1 << 24) / ca->mi.bucket_size));
-
-       ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
-err:
-       bch_err_fn(ca, ret);
-       return ret;
-}
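
A worked example of the sizing rule above as a standalone sketch; BCH_JOURNAL_BUCKETS_MIN is defined elsewhere in the tree and assumed to be 8 here:

    #include <stdio.h>

    #define JOURNAL_BUCKETS_MIN     8       /* assumed value */

    /* 1/128th of the device, clamped to at most 8192 buckets or 8GB of
     * journal (1 << 24 sectors of 512 bytes), whichever is smaller. */
    static unsigned journal_buckets(unsigned long long nbuckets,
                                    unsigned bucket_size_sectors)
    {
            unsigned nr  = nbuckets >> 7;
            unsigned max = (1U << 24) / bucket_size_sectors;

            if (max > (1U << 13))
                    max = 1U << 13;
            if (nr < JOURNAL_BUCKETS_MIN)
                    nr = JOURNAL_BUCKETS_MIN;
            return nr < max ? nr : max;
    }

    int main(void)
    {
            /* 1TiB device, 512KiB buckets: 2097152 buckets of 1024 sectors */
            printf("%u\n", journal_buckets(2097152, 1024));  /* prints 8192 */
            return 0;
    }
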
-
-int bch2_fs_journal_alloc(struct bch_fs *c)
-{
-       for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
-               if (ca->journal.nr)
-                       continue;
-
-               int ret = bch2_dev_journal_alloc(ca, true);
-               if (ret) {
-                       enumerated_ref_put(&ca->io_ref[READ],
-                                          BCH_DEV_READ_REF_fs_journal_alloc);
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-/* startup/shutdown: */
-
-static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
-{
-       bool ret = false;
-       u64 seq;
-
-       spin_lock(&j->lock);
-       for (seq = journal_last_unwritten_seq(j);
-            seq <= journal_cur_seq(j) && !ret;
-            seq++) {
-               struct journal_buf *buf = journal_seq_to_buf(j, seq);
-
-               if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
-                       ret = true;
-       }
-       spin_unlock(&j->lock);
-
-       return ret;
-}
-
-void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
-{
-       wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
-}
-
-void bch2_fs_journal_stop(struct journal *j)
-{
-       if (!test_bit(JOURNAL_running, &j->flags))
-               return;
-
-       bch2_journal_reclaim_stop(j);
-       bch2_journal_flush_all_pins(j);
-
-       wait_event(j->wait, bch2_journal_entry_close(j));
-
-       /*
-        * Always write a new journal entry, to make sure the clock hands are up
-        * to date (and match the superblock)
-        */
-       __bch2_journal_meta(j);
-
-       journal_quiesce(j);
-       cancel_delayed_work_sync(&j->write_work);
-
-       WARN(!bch2_journal_error(j) &&
-            test_bit(JOURNAL_replay_done, &j->flags) &&
-            j->last_empty_seq != journal_cur_seq(j),
-            "journal shutdown error: cur seq %llu but last empty seq %llu",
-            journal_cur_seq(j), j->last_empty_seq);
-
-       if (!bch2_journal_error(j))
-               clear_bit(JOURNAL_running, &j->flags);
-}
-
-int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_entry_pin_list *p;
-       struct journal_replay *i, **_i;
-       struct genradix_iter iter;
-       bool had_entries = false;
-
-       /* XXX: pick the most recent non-blacklisted sequence number */
-
-       cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
-
-       if (cur_seq >= JOURNAL_SEQ_MAX) {
-               bch_err(c, "cannot start: journal seq overflow");
-               return -EINVAL;
-       }
-
-       /* Clean filesystem? */
-       if (!last_seq)
-               last_seq = cur_seq;
-
-       u64 nr = cur_seq - last_seq;
-
-       /*
-        * Extra fudge factor, in case we crashed when the journal pin fifo was
-        * nearly or completely full. We'll need to be able to open additional
-        * journal entries (at least a few) in order for journal replay to get
-        * going:
-        */
-       nr += nr / 4;
-
-       nr = max(nr, JOURNAL_PIN);
-       init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
-       if (!j->pin.data) {
-               bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
-               return bch_err_throw(c, ENOMEM_journal_pin_fifo);
-       }
-
-       j->replay_journal_seq   = last_seq;
-       j->replay_journal_seq_end = cur_seq;
-       j->last_seq_ondisk      = last_seq;
-       j->flushed_seq_ondisk   = cur_seq - 1;
-       j->seq_write_started    = cur_seq - 1;
-       j->seq_ondisk           = cur_seq - 1;
-       j->pin.front            = last_seq;
-       j->pin.back             = cur_seq;
-       atomic64_set(&j->seq, cur_seq - 1);
-
-       u64 seq;
-       fifo_for_each_entry_ptr(p, &j->pin, seq)
-               journal_pin_list_init(p, 1);
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (journal_replay_ignore(i))
-                       continue;
-
-               seq = le64_to_cpu(i->j.seq);
-               BUG_ON(seq >= cur_seq);
-
-               if (seq < last_seq)
-                       continue;
-
-               if (journal_entry_empty(&i->j))
-                       j->last_empty_seq = le64_to_cpu(i->j.seq);
-
-               p = journal_seq_pin(j, seq);
-
-               p->devs.nr = 0;
-               darray_for_each(i->ptrs, ptr)
-                       bch2_dev_list_add_dev(&p->devs, ptr->dev);
-
-               had_entries = true;
-       }
-
-       if (!had_entries)
-               j->last_empty_seq = cur_seq - 1; /* to match j->seq */
-
-       spin_lock(&j->lock);
-       j->last_flush_write = jiffies;
-
-       j->reservations.idx = journal_cur_seq(j);
-
-       c->last_bucket_seq_cleanup = journal_cur_seq(j);
-       spin_unlock(&j->lock);
-
-       return 0;
-}
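
The pin-fifo sizing above in isolation: take the span of dirty entries, add a 25% fudge factor, raise it to a floor, and round up to a power of two. JOURNAL_PIN lives in journal_types.h and is assumed to be 32 * 1024 here:

    #include <stdio.h>

    #define JOURNAL_PIN     (32 * 1024)     /* assumed, from journal_types.h */

    static unsigned long long roundup_pow_of_two(unsigned long long n)
    {
            unsigned long long r = 1;
            while (r < n)
                    r <<= 1;
            return r;
    }

    static unsigned long long pin_fifo_size(unsigned long long last_seq,
                                            unsigned long long cur_seq)
    {
            unsigned long long nr = cur_seq - last_seq;

            nr += nr / 4;           /* room to open new entries during replay */
            if (nr < JOURNAL_PIN)
                    nr = JOURNAL_PIN;
            return roundup_pow_of_two(nr);
    }

    int main(void)
    {
            printf("%llu\n", pin_fifo_size(1000, 51000));   /* 62500 -> 65536 */
            return 0;
    }
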
-
-void bch2_journal_set_replay_done(struct journal *j)
-{
-       /*
-        * journal_space_available must happen before setting JOURNAL_running;
-        * JOURNAL_running must happen before JOURNAL_replay_done
-        */
-       spin_lock(&j->lock);
-       bch2_journal_space_available(j);
-
-       set_bit(JOURNAL_need_flush_write, &j->flags);
-       set_bit(JOURNAL_running, &j->flags);
-       set_bit(JOURNAL_replay_done, &j->flags);
-       spin_unlock(&j->lock);
-}
-
-/* init/exit: */
-
-void bch2_dev_journal_exit(struct bch_dev *ca)
-{
-       struct journal_device *ja = &ca->journal;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
-               kfree(ja->bio[i]);
-               ja->bio[i] = NULL;
-       }
-
-       kfree(ja->buckets);
-       kfree(ja->bucket_seq);
-       ja->buckets     = NULL;
-       ja->bucket_seq  = NULL;
-}
-
-int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
-{
-       struct bch_fs *c = ca->fs;
-       struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets =
-               bch2_sb_field_get(sb, journal);
-       struct bch_sb_field_journal_v2 *journal_buckets_v2 =
-               bch2_sb_field_get(sb, journal_v2);
-
-       ja->nr = 0;
-
-       if (journal_buckets_v2) {
-               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-
-               for (unsigned i = 0; i < nr; i++)
-                       ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
-       } else if (journal_buckets) {
-               ja->nr = bch2_nr_journal_buckets(journal_buckets);
-       }
-
-       ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
-       if (!ja->bucket_seq)
-               return bch_err_throw(c, ENOMEM_dev_journal_init);
-
-       unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
-               ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
-                                    nr_bvecs), GFP_KERNEL);
-               if (!ja->bio[i])
-                       return bch_err_throw(c, ENOMEM_dev_journal_init);
-
-               ja->bio[i]->ca = ca;
-               ja->bio[i]->buf_idx = i;
-               bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
-       }
-
-       ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
-       if (!ja->buckets)
-               return bch_err_throw(c, ENOMEM_dev_journal_init);
-
-       if (journal_buckets_v2) {
-               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-               unsigned dst = 0;
-
-               for (unsigned i = 0; i < nr; i++)
-                       for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
-                               ja->buckets[dst++] =
-                                       le64_to_cpu(journal_buckets_v2->d[i].start) + j;
-       } else if (journal_buckets) {
-               for (unsigned i = 0; i < ja->nr; i++)
-                       ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
-       }
-
-       return 0;
-}
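
The journal_v2 superblock field stores (start, nr) ranges rather than individual bucket numbers, and the loop above flattens those ranges into ja->buckets. The decode in isolation, as a standalone sketch with made-up ranges:

    #include <stdio.h>

    struct range { unsigned long long start, nr; };

    int main(void)
    {
            /* two contiguous runs of journal buckets, as the sb would store */
            struct range d[] = { { 100, 3 }, { 200, 2 } };
            unsigned long long buckets[5];
            unsigned dst = 0;

            for (unsigned i = 0; i < 2; i++)
                    for (unsigned long long j = 0; j < d[i].nr; j++)
                            buckets[dst++] = d[i].start + j;

            for (unsigned i = 0; i < dst; i++)
                    printf("%llu ", buckets[i]);    /* 100 101 102 200 201 */
            printf("\n");
            return 0;
    }
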
-
-void bch2_fs_journal_exit(struct journal *j)
-{
-       if (j->wq)
-               destroy_workqueue(j->wq);
-
-       darray_exit(&j->early_journal_entries);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
-               kvfree(j->buf[i].data);
-       kvfree(j->free_buf);
-       free_fifo(&j->pin);
-}
-
-void bch2_fs_journal_init_early(struct journal *j)
-{
-       static struct lock_class_key res_key;
-
-       mutex_init(&j->buf_lock);
-       spin_lock_init(&j->lock);
-       spin_lock_init(&j->err_lock);
-       init_waitqueue_head(&j->wait);
-       INIT_DELAYED_WORK(&j->write_work, journal_write_work);
-       init_waitqueue_head(&j->reclaim_wait);
-       init_waitqueue_head(&j->pin_flush_wait);
-       mutex_init(&j->reclaim_lock);
-       mutex_init(&j->discard_lock);
-
-       lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
-       atomic64_set(&j->reservations.counter,
-               ((union journal_res_state)
-                { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-}
-
-int bch2_fs_journal_init(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
-       j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
-       if (!j->free_buf)
-               return bch_err_throw(c, ENOMEM_journal_buf);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
-               j->buf[i].idx = i;
-
-       j->wq = alloc_workqueue("bcachefs_journal",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
-       if (!j->wq)
-               return bch_err_throw(c, ENOMEM_fs_other_alloc);
-       return 0;
-}
-
-/* debug: */
-
-static const char * const bch2_journal_flags_strs[] = {
-#define x(n)   #n,
-       JOURNAL_FLAGS()
-#undef x
-       NULL
-};
-
-void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       union journal_res_state s;
-       unsigned long now = jiffies;
-       u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
-
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 28);
-       out->atomic++;
-
-       guard(rcu)();
-       s = READ_ONCE(j->reservations);
-
-       prt_printf(out, "flags:\t");
-       prt_bitflags(out, bch2_journal_flags_strs, j->flags);
-       prt_newline(out);
-       prt_printf(out, "dirty journal entries:\t%llu/%llu\n",  fifo_used(&j->pin), j->pin.size);
-       prt_printf(out, "seq:\t%llu\n",                         journal_cur_seq(j));
-       prt_printf(out, "seq_ondisk:\t%llu\n",                  j->seq_ondisk);
-       prt_printf(out, "last_seq:\t%llu\n",                    journal_last_seq(j));
-       prt_printf(out, "last_seq_ondisk:\t%llu\n",             j->last_seq_ondisk);
-       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",          j->flushed_seq_ondisk);
-       prt_printf(out, "watermark:\t%s\n",                     bch2_watermarks[j->watermark]);
-       prt_printf(out, "each entry reserved:\t%u\n",           j->entry_u64s_reserved);
-       prt_printf(out, "nr flush writes:\t%llu\n",             j->nr_flush_writes);
-       prt_printf(out, "nr noflush writes:\t%llu\n",           j->nr_noflush_writes);
-       prt_printf(out, "average write size:\t");
-       prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
-       prt_newline(out);
-       prt_printf(out, "free buf:\t%u\n",                      j->free_buf ? j->free_buf_size : 0);
-       prt_printf(out, "nr direct reclaim:\t%llu\n",           j->nr_direct_reclaim);
-       prt_printf(out, "nr background reclaim:\t%llu\n",       j->nr_background_reclaim);
-       prt_printf(out, "reclaim kicked:\t%u\n",                j->reclaim_kicked);
-       prt_printf(out, "reclaim runs in:\t%u ms\n",            time_after(j->next_reclaim, now)
-              ? jiffies_to_msecs(j->next_reclaim - now) : 0);
-       prt_printf(out, "blocked:\t%u\n",                       j->blocked);
-       prt_printf(out, "current entry sectors:\t%u\n",         j->cur_entry_sectors);
-       prt_printf(out, "current entry error:\t%s\n",           bch2_err_str(j->cur_entry_error));
-       prt_printf(out, "current entry:\t");
-
-       switch (s.cur_entry_offset) {
-       case JOURNAL_ENTRY_ERROR_VAL:
-               prt_printf(out, "error\n");
-               break;
-       case JOURNAL_ENTRY_CLOSED_VAL:
-               prt_printf(out, "closed\n");
-               break;
-       case JOURNAL_ENTRY_BLOCKED_VAL:
-               prt_printf(out, "blocked\n");
-               break;
-       default:
-               prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
-               break;
-       }
-
-       prt_printf(out, "unwritten entries:\n");
-       bch2_journal_bufs_to_text(out, j);
-
-       prt_printf(out, "space:\n");
-       printbuf_indent_add(out, 2);
-       prt_printf(out, "discarded\t%u:%u\n",
-              j->space[journal_space_discarded].next_entry,
-              j->space[journal_space_discarded].total);
-       prt_printf(out, "clean ondisk\t%u:%u\n",
-              j->space[journal_space_clean_ondisk].next_entry,
-              j->space[journal_space_clean_ondisk].total);
-       prt_printf(out, "clean\t%u:%u\n",
-              j->space[journal_space_clean].next_entry,
-              j->space[journal_space_clean].total);
-       prt_printf(out, "total\t%u:%u\n",
-              j->space[journal_space_total].next_entry,
-              j->space[journal_space_total].total);
-       printbuf_indent_sub(out, 2);
-
-       for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
-               if (!ca->mi.durability)
-                       continue;
-
-               struct journal_device *ja = &ca->journal;
-
-               if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
-                       continue;
-
-               if (!ja->nr)
-                       continue;
-
-               prt_printf(out, "dev %u:\n",                    ca->dev_idx);
-               prt_printf(out, "durability %u:\n",             ca->mi.durability);
-               printbuf_indent_add(out, 2);
-               prt_printf(out, "nr\t%u\n",                     ja->nr);
-               prt_printf(out, "bucket size\t%u\n",            ca->mi.bucket_size);
-               prt_printf(out, "available\t%u:%u\n",           bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
-               prt_printf(out, "discard_idx\t%u\n",            ja->discard_idx);
-               prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk,   ja->bucket_seq[ja->dirty_idx_ondisk]);
-               prt_printf(out, "dirty_idx\t%u (seq %llu)\n",   ja->dirty_idx,          ja->bucket_seq[ja->dirty_idx]);
-               prt_printf(out, "cur_idx\t%u (seq %llu)\n",     ja->cur_idx,            ja->bucket_seq[ja->cur_idx]);
-               printbuf_indent_sub(out, 2);
-       }
-
-       prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
-
-       --out->atomic;
-}
-
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
-       spin_lock(&j->lock);
-       __bch2_journal_debug_to_text(out, j);
-       spin_unlock(&j->lock);
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
deleted file mode 100644 (file)
index 9779070..0000000
+++ /dev/null
@@ -1,465 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_H
-#define _BCACHEFS_JOURNAL_H
-
-/*
- * THE JOURNAL:
- *
- * The primary purpose of the journal is to log updates (insertions) to the
- * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
- *
- * Without the journal, the b-tree is always internally consistent on
- * disk - and in fact, in the earliest incarnations bcache didn't have a journal
- * but did handle unclean shutdowns by doing all index updates synchronously
- * (with coalescing).
- *
- * Updates to interior nodes still happen synchronously and without the journal
- * (for simplicity) - this may change eventually but updates to interior nodes
- * are rare enough it's not a huge priority.
- *
- * This means the journal is relatively separate from the b-tree; it consists of
- * just a list of keys and journal replay consists of just redoing those
- * insertions in same order that they appear in the journal.
- *
- * PERSISTENCE:
- *
- * For synchronous updates (where we're waiting on the index update to hit
- * disk), the journal entry will be written out immediately (or as soon as
- * possible, if the write for the previous journal entry was still in flight).
- *
- * Synchronous updates are specified by passing a closure (@flush_cl) to
- * bch2_btree_insert() or bch2_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will wait on the journal write to
- * complete (via closure_wait()).
- *
- * If the index update wasn't synchronous, the journal entry will be
- * written out after 10 ms have elapsed, by default (the delay_ms field
- * in struct journal).
- *
- * JOURNAL ENTRIES:
- *
- * A journal entry is variable size (struct jset), it's got a fixed length
- * header and then a variable number of struct jset_entry entries.
- *
- * Journal entries are identified by monotonically increasing 64 bit sequence
- * numbers - jset->seq; other places in the code refer to this sequence number.
- *
- * A jset_entry entry contains one or more bkeys (which is what gets inserted
- * into the b-tree). We need a container to indicate which b-tree the key is
- * for; also, the roots of the various b-trees are stored in jset_entry entries
- * (one for each b-tree) - this lets us add new b-tree types without changing
- * the on disk format.
- *
- * We also keep some things in the journal header that are logically part of the
- * superblock - all the things that are frequently updated. This is for future
- * bcache on raw flash support; the superblock (which will become another
- * journal) can't be moved or wear leveled, so it contains just enough
- * information to find the main journal, and the superblock only has to be
- * rewritten when we want to move/wear level the main journal.
- *
- * JOURNAL LAYOUT ON DISK:
- *
- * The journal is written to a ringbuffer of buckets (which is kept in the
- * superblock); the individual buckets are not necessarily contiguous on disk
- * which means that journal entries are not allowed to span buckets, but also
- * that we can resize the journal at runtime if desired (unimplemented).
- *
- * The journal buckets exist in the same pool as all the other buckets that are
- * managed by the allocator and garbage collection - garbage collection marks
- * the journal buckets as metadata buckets.
- *
- * OPEN/DIRTY JOURNAL ENTRIES:
- *
- * Open/dirty journal entries are journal entries that contain b-tree updates
- * that have not yet been written out to the b-tree on disk. We have to track
- * which journal entries are dirty, and we also have to avoid wrapping around
- * the journal and overwriting old but still dirty journal entries with new
- * journal entries.
- *
- * On disk, this is represented with the "last_seq" field of struct jset;
- * last_seq is the first sequence number that journal replay has to replay.
- *
- * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
- * journal_device->seq) of for each journal bucket, the highest sequence number
- * any journal entry it contains. Then, by comparing that against last_seq we
- * can determine whether that journal bucket contains dirty journal entries or
- * not.
- *
- * To track which journal entries are dirty, we maintain a fifo of refcounts
- * (where each entry corresponds to a specific sequence number) - when a ref
- * goes to 0, that journal entry is no longer dirty.
- *
- * Journalling of index updates is done at the same time as the b-tree itself is
- * being modified (see btree_insert_key()); when we add the key to the journal
- * the pending b-tree write takes a ref on the journal entry the key was added
- * to. If a pending b-tree write would need to take refs on multiple dirty
- * journal entries, it only keeps the ref on the oldest one (since a newer
- * journal entry will still be replayed if an older entry was dirty).
- *
- * JOURNAL FILLING UP:
- *
- * There are two ways the journal could fill up; either we could run out of
- * space to write to, or we could have too many open journal entries and run out
- * of room in the fifo of refcounts. Since those refcounts are decremented
- * without any locking we can't safely resize that fifo, so we handle it the
- * same way.
- *
- * If the journal fills up, we start flushing dirty btree nodes until we can
- * allocate space for a journal write again - preferentially flushing btree
- * nodes that are pinning the oldest journal entries first.
- */
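
The "fifo of refcounts" described above can be reduced to a few lines of standalone C: each slot is the refcount of one dirty entry, and last_seq only advances past entries, in order, once their count drops to zero:

    #include <stdio.h>

    int main(void)
    {
            /* refcounts for sequence numbers last_seq..last_seq+3 */
            unsigned pin[4] = { 2, 0, 1, 1 };
            unsigned long long last_seq = 100;
            unsigned front = 0, used = 4;

            pin[0] -= 2;    /* the b-tree writes pinning seq 100 complete */

            /* entries only become clean from the front, in order */
            while (used && !pin[front]) {
                    front++;
                    used--;
                    last_seq++;
            }
            printf("last_seq=%llu\n", last_seq); /* 102: seqs 100-101 clean */
            return 0;
    }

Note that seq 101's count was already zero, but last_seq could not move past it until seq 100 was also clean, which is exactly why a pending b-tree write only needs to hold a ref on the oldest entry it touched.
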
-
-#include <linux/hash.h>
-
-#include "journal_types.h"
-
-struct bch_fs;
-
-static inline void journal_wake(struct journal *j)
-{
-       wake_up(&j->wait);
-       closure_wake_up(&j->async_wait);
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
-       return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
-       return atomic64_read(&j->seq);
-}
-
-static inline u64 journal_last_unwritten_seq(struct journal *j)
-{
-       return j->seq_ondisk + 1;
-}
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
-       unsigned idx = (journal_cur_seq(j) &
-                       JOURNAL_BUF_MASK &
-                       ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
-
-       return j->buf + idx;
-}
-
-static inline int journal_state_count(union journal_res_state s, int idx)
-{
-       switch (idx) {
-       case 0: return s.buf0_count;
-       case 1: return s.buf1_count;
-       case 2: return s.buf2_count;
-       case 3: return s.buf3_count;
-       }
-       BUG();
-}
-
-static inline int journal_state_seq_count(struct journal *j,
-                                         union journal_res_state s, u64 seq)
-{
-       if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
-               return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
-       else
-               return 0;
-}
-
-static inline void journal_state_inc(union journal_res_state *s)
-{
-       s->buf0_count += s->idx == 0;
-       s->buf1_count += s->idx == 1;
-       s->buf2_count += s->idx == 2;
-       s->buf3_count += s->idx == 3;
-}
-
-/*
- * Amount of space that will be taken up by some keys in the journal (i.e.
- * including the struct jset_entry header)
- */
-static inline unsigned jset_u64s(unsigned u64s)
-{
-       return u64s + sizeof(struct jset_entry) / sizeof(u64);
-}
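
In other words, struct jset_entry is an eight-byte header (one u64) followed by its payload, so jset_u64s(n) works out to n + 1: a key occupying three u64s costs four u64s of journal space.
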
-
-static inline int journal_entry_overhead(struct journal *j)
-{
-       return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
-{
-       struct jset *jset = buf->data;
-       struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
-
-       memset(entry, 0, sizeof(*entry));
-       entry->u64s = cpu_to_le16(u64s);
-
-       le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-
-       return entry;
-}
-
-static inline struct jset_entry *
-journal_res_entry(struct journal *j, struct journal_res *res)
-{
-       return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
-}
-
-static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
-                                         enum btree_id id, unsigned level,
-                                         unsigned u64s)
-{
-       entry->u64s     = cpu_to_le16(u64s);
-       entry->btree_id = id;
-       entry->level    = level;
-       entry->type     = type;
-       entry->pad[0]   = 0;
-       entry->pad[1]   = 0;
-       entry->pad[2]   = 0;
-       return jset_u64s(u64s);
-}
-
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
-                                         enum btree_id id, unsigned level,
-                                         const void *data, unsigned u64s)
-{
-       unsigned ret = journal_entry_init(entry, type, id, level, u64s);
-
-       memcpy_u64s_small(entry->_data, data, u64s);
-       return ret;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry(struct journal *j, struct journal_res *res,
-                        unsigned type, enum btree_id id,
-                        unsigned level, unsigned u64s)
-{
-       struct jset_entry *entry = journal_res_entry(j, res);
-       unsigned actual = journal_entry_init(entry, type, id, level, u64s);
-
-       EBUG_ON(!res->ref);
-       EBUG_ON(actual > res->u64s);
-
-       res->offset     += actual;
-       res->u64s       -= actual;
-       return entry;
-}
-
-static inline bool journal_entry_empty(struct jset *j)
-{
-       if (j->seq != j->last_seq)
-               return false;
-
-       vstruct_for_each(j, i)
-               if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
-                       return false;
-       return true;
-}
-
-/*
- * Drop a reference on a buffer index and return the resulting reservation
- * state; the caller checks whether the count hit zero.
- */
-static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
-{
-       union journal_res_state s;
-
-       s.v = atomic64_sub_return(((union journal_res_state) {
-                                   .buf0_count = idx == 0,
-                                   .buf1_count = idx == 1,
-                                   .buf2_count = idx == 2,
-                                   .buf3_count = idx == 3,
-                                   }).v, &j->reservations.counter);
-       return s;
-}
-
-bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_do_writes(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64);
-
-static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
-{
-       unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
-       union journal_res_state s;
-
-       s = journal_state_buf_put(j, idx);
-       if (!journal_state_count(s, idx))
-               bch2_journal_buf_put_final(j, seq);
-}
-
-static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
-{
-       unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
-       union journal_res_state s;
-
-       s = journal_state_buf_put(j, idx);
-       if (!journal_state_count(s, idx)) {
-               spin_lock(&j->lock);
-               bch2_journal_buf_put_final(j, seq);
-               spin_unlock(&j->lock);
-       } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
-               wake_up(&j->wait);
-}
-
-/*
- * This function releases the journal write structure so other threads can
- * then proceed to add their keys as well.
- */
-static inline void bch2_journal_res_put(struct journal *j,
-                                      struct journal_res *res)
-{
-       if (!res->ref)
-               return;
-
-       lock_release(&j->res_map, _THIS_IP_);
-
-       while (res->u64s)
-               bch2_journal_add_entry(j, res,
-                                      BCH_JSET_ENTRY_btree_keys,
-                                      0, 0, 0);
-
-       bch2_journal_buf_put(j, res->seq);
-
-       res->ref = 0;
-}
-
-int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
-                                 unsigned, struct btree_trans *);
-
-/* First bits for BCH_WATERMARK: */
-enum journal_res_flags {
-       __JOURNAL_RES_GET_NONBLOCK      = BCH_WATERMARK_BITS,
-       __JOURNAL_RES_GET_CHECK,
-};
-
-#define JOURNAL_RES_GET_NONBLOCK       (1 << __JOURNAL_RES_GET_NONBLOCK)
-#define JOURNAL_RES_GET_CHECK          (1 << __JOURNAL_RES_GET_CHECK)
-
-static inline int journal_res_get_fast(struct journal *j,
-                                      struct journal_res *res,
-                                      unsigned flags)
-{
-       union journal_res_state old, new;
-
-       old.v = atomic64_read(&j->reservations.counter);
-       do {
-               new.v = old.v;
-
-               /*
-                * Check if there is still room in the current journal
-                * entry, smp_rmb() guarantees that reads from reservations.counter
-                * occur before accessing cur_entry_u64s:
-                */
-               smp_rmb();
-               if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
-                       return 0;
-
-               EBUG_ON(!journal_state_count(new, new.idx));
-
-               if ((flags & BCH_WATERMARK_MASK) < j->watermark)
-                       return 0;
-
-               new.cur_entry_offset += res->u64s;
-               journal_state_inc(&new);
-
-               /*
-                * If the refcount would overflow, we have to wait:
-                * XXX - tracepoint this:
-                */
-               if (!journal_state_count(new, new.idx))
-                       return 0;
-
-               if (flags & JOURNAL_RES_GET_CHECK)
-                       return 1;
-       } while (!atomic64_try_cmpxchg(&j->reservations.counter,
-                                      &old.v, new.v));
-
-       res->ref        = true;
-       res->offset     = old.cur_entry_offset;
-       res->seq        = journal_cur_seq(j);
-       res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
-       return 1;
-}
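
The fast path is a classic lockless claim loop: pack the entry cursor into an atomic word, advance it with compare-and-exchange, and fall back to the slow path when the entry is full. A standalone C11 reduction of just that pattern:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned cur_entry_offset;  /* packed cursor, as above */
    static unsigned cur_entry_u64s = 512;      /* capacity of the open entry */

    static bool res_get_fast(unsigned u64s, unsigned *offset)
    {
            unsigned old = atomic_load(&cur_entry_offset), new;

            do {
                    if (old + u64s > cur_entry_u64s)
                            return false;   /* full: take the slow path */
                    new = old + u64s;
            } while (!atomic_compare_exchange_weak(&cur_entry_offset,
                                                   &old, new));

            *offset = old;  /* this slice of the buffer is now ours alone */
            return true;
    }

    int main(void)
    {
            unsigned offset;

            if (res_get_fast(16, &offset))
                    printf("reserved 16 u64s at offset %u\n", offset);
            return 0;
    }

The real version additionally packs the buffer index and per-buffer refcounts into the same word, which is why it manipulates union journal_res_state rather than a bare offset.
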
-
-static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
-                                      unsigned u64s, unsigned flags,
-                                      struct btree_trans *trans)
-{
-       int ret;
-
-       EBUG_ON(res->ref);
-       EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
-
-       res->u64s = u64s;
-
-       if (journal_res_get_fast(j, res, flags))
-               goto out;
-
-       ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
-       if (ret)
-               return ret;
-out:
-       if (!(flags & JOURNAL_RES_GET_CHECK)) {
-               lock_acquire_shared(&j->res_map, 0,
-                                   (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
-                                   NULL, _THIS_IP_);
-               EBUG_ON(!res->ref);
-               BUG_ON(!res->seq);
-       }
-       return 0;
-}
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *,
-                                  struct journal_entry_res *,
-                                  unsigned);
-
-int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
-void bch2_journal_flush_async(struct journal *, struct closure *);
-
-int bch2_journal_flush_seq(struct journal *, u64, unsigned);
-int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64, u64);
-int bch2_journal_meta(struct journal *);
-
-void bch2_journal_halt_locked(struct journal *);
-void bch2_journal_halt(struct journal *);
-
-static inline int bch2_journal_error(struct journal *j)
-{
-       return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
-               ? -BCH_ERR_journal_shutdown : 0;
-}
-
-struct bch_dev;
-
-void bch2_journal_unblock(struct journal *);
-void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
-
-void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
-int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
-
-int bch2_dev_journal_alloc(struct bch_dev *, bool);
-int bch2_fs_journal_alloc(struct bch_fs *);
-
-void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
-
-void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, u64);
-void bch2_journal_set_replay_done(struct journal *);
-
-void bch2_dev_journal_exit(struct bch_dev *);
-int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
-void bch2_fs_journal_exit(struct journal *);
-void bch2_fs_journal_init_early(struct journal *);
-int bch2_fs_journal_init(struct journal *);
-
-#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
deleted file mode 100644 (file)
index 29bea8e..0000000
+++ /dev/null
@@ -1,2242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_io.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/string_choices.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
-{
-       lockdep_assert_held(&c->sb_lock);
-
-       for_each_member_device(c, ca) {
-               struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
-               m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
-               m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
-       }
-}
-
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
-{
-       mutex_lock(&c->sb_lock);
-       for_each_member_device(c, ca) {
-               struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
-
-               unsigned idx = le32_to_cpu(m.last_journal_bucket);
-               if (idx < ca->journal.nr)
-                       ca->journal.cur_idx = idx;
-               unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
-               if (offset <= ca->mi.bucket_size)
-                       ca->journal.sectors_free = ca->mi.bucket_size - offset;
-       }
-       mutex_unlock(&c->sb_lock);
-}
-
-static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
-{
-       struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
-       prt_printf(out, "%s %u:%u:%u (sector %llu)",
-                  ca ? ca->name : "(invalid dev)",
-                  p->dev, p->bucket, p->bucket_offset, p->sector);
-       bch2_dev_put(ca);
-}
-
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
-{
-       darray_for_each(j->ptrs, i) {
-               if (i != j->ptrs.data)
-                       prt_printf(out, " ");
-               bch2_journal_ptr_to_text(out, c, i);
-       }
-}
-
-static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
-{
-       for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
-               struct jset_entry_datetime *datetime =
-                       container_of(entry, struct jset_entry_datetime, entry);
-               bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-               break;
-       }
-}
-
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
-                                       struct journal_replay *j)
-{
-       prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-       bch2_journal_datetime_to_text(out, &j->j);
-       prt_char(out, ' ');
-       bch2_journal_ptrs_to_text(out, c, j);
-}
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = ((__le32 *) &jset->seq)[0],
-               [2] = ((__le32 *) &jset->seq)[1],
-               [3] = BCH_NONCE_JOURNAL,
-       }};
-}
-
-static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
-{
-       if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
-               *csum = (struct bch_csum) {};
-               return false;
-       }
-
-       *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
-       return !bch2_crc_cmp(j->csum, *csum);
-}
-
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
-       return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
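
As the comment in journal_entry_add() below explains, genradixes are ulong-indexed, so sequence numbers are windowed relative to the first one seen. The masking in isolation:

    #include <stdint.h>
    #include <stdio.h>

    /* Map a 64-bit sequence number into a 31-bit window around base_seq,
     * mirroring journal_entry_radix_idx() above. */
    static uint32_t radix_idx(uint64_t base_seq, uint64_t seq)
    {
            return (seq - base_seq) & (~0U >> 1);
    }

    int main(void)
    {
            uint64_t base = (1ULL << 40) + 5;  /* arbitrary first-seen seq */

            printf("%u %u\n", radix_idx(base, base),
                   radix_idx(base, base + 1000));  /* prints: 0 1000 */
            return 0;
    }
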
-
-static void __journal_replay_free(struct bch_fs *c,
-                                 struct journal_replay *i)
-{
-       struct journal_replay **p =
-               genradix_ptr(&c->journal_entries,
-                            journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
-
-       BUG_ON(*p != i);
-       *p = NULL;
-       kvfree(i);
-}
-
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
-{
-       if (blacklisted)
-               i->ignore_blacklisted = true;
-       else
-               i->ignore_not_dirty = true;
-
-       if (!c->opts.read_entire_journal)
-               __journal_replay_free(c, i);
-}
-
-struct journal_list {
-       struct closure          cl;
-       u64                     last_seq;
-       struct mutex            lock;
-       int                     ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK           0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
-                            struct journal_ptr entry_ptr,
-                            struct journal_list *jlist, struct jset *j)
-{
-       struct genradix_iter iter;
-       struct journal_replay **_i, *i, *dup;
-       size_t bytes = vstruct_bytes(j);
-       u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
-       struct printbuf buf = PRINTBUF;
-       int ret = JOURNAL_ENTRY_ADD_OK;
-
-       if (last_seq && c->opts.journal_rewind)
-               last_seq = min(last_seq, c->opts.journal_rewind);
-
-       if (!c->journal.oldest_seq_found_ondisk ||
-           le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
-               c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
-
-       /* Is this entry older than the range we need? */
-       if (!c->opts.read_entire_journal &&
-           le64_to_cpu(j->seq) < jlist->last_seq)
-               return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-
-       /*
-        * genradixes are indexed by a ulong, not a u64, so we can't index them
-        * by sequence number directly: Assume instead that they will all fall
- * within the range of +-2 billion of the first one we find.
-        */
-       if (!c->journal_entries_base_seq)
-               c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
-
-       /* Drop entries we don't need anymore */
-       if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
-               genradix_for_each_from(&c->journal_entries, iter, _i,
-                                      journal_entry_radix_idx(c, jlist->last_seq)) {
-                       i = *_i;
-
-                       if (journal_replay_ignore(i))
-                               continue;
-
-                       if (le64_to_cpu(i->j.seq) >= last_seq)
-                               break;
-
-                       journal_replay_free(c, i, false);
-               }
-       }
-
-       jlist->last_seq = max(jlist->last_seq, last_seq);
-
-       _i = genradix_ptr_alloc(&c->journal_entries,
-                               journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
-                               GFP_KERNEL);
-       if (!_i)
-               return bch_err_throw(c, ENOMEM_journal_entry_add);
-
-       /*
-        * Duplicate journal entries? If so we want the one that didn't have a
-        * checksum error:
-        */
-       dup = *_i;
-       if (dup) {
-               bool identical = bytes == vstruct_bytes(&dup->j) &&
-                       !memcmp(j, &dup->j, bytes);
-               bool not_identical = !identical &&
-                       entry_ptr.csum_good &&
-                       dup->csum_good;
-
-               bool same_device = false;
-               darray_for_each(dup->ptrs, ptr)
-                       if (ptr->dev == ca->dev_idx)
-                               same_device = true;
-
-               ret = darray_push(&dup->ptrs, entry_ptr);
-               if (ret)
-                       goto out;
-
-               bch2_journal_replay_to_text(&buf, c, dup);
-
-               fsck_err_on(same_device,
-                           c, journal_entry_dup_same_device,
-                           "duplicate journal entry on same device\n%s",
-                           buf.buf);
-
-               fsck_err_on(not_identical,
-                           c, journal_entry_replicas_data_mismatch,
-                           "found duplicate but non identical journal entries\n%s",
-                           buf.buf);
-
-               if (entry_ptr.csum_good && !identical)
-                       goto replace;
-
-               goto out;
-       }
-replace:
-       i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
-       if (!i)
-               return bch_err_throw(c, ENOMEM_journal_entry_add);
-
-       darray_init(&i->ptrs);
-       i->csum_good            = entry_ptr.csum_good;
-       i->ignore_blacklisted   = false;
-       i->ignore_not_dirty     = false;
-       unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-
-       if (dup) {
-               /* The first ptr should represent the jset we kept: */
-               darray_for_each(dup->ptrs, ptr)
-                       darray_push(&i->ptrs, *ptr);
-               __journal_replay_free(c, dup);
-       } else {
-               darray_push(&i->ptrs, entry_ptr);
-       }
-
-       *_i = i;
-out:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/* this fills in a range with empty jset_entries: */
-static void journal_entry_null_range(void *start, void *end)
-{
-       struct jset_entry *entry;
-
-       for (entry = start; entry != end; entry = vstruct_next(entry))
-               memset(entry, 0, sizeof(*entry));
-}
-
-#define JOURNAL_ENTRY_REREAD   5
-#define JOURNAL_ENTRY_NONE     6
-#define JOURNAL_ENTRY_BAD      7
-
-static void journal_entry_err_msg(struct printbuf *out,
-                                 u32 version,
-                                 struct jset *jset,
-                                 struct jset_entry *entry)
-{
-       prt_str(out, "invalid journal entry, version=");
-       bch2_version_to_text(out, version);
-
-       if (entry) {
-               prt_str(out, " type=");
-               bch2_prt_jset_entry_type(out, entry->type);
-       }
-
-       if (!jset) {
-               prt_printf(out, " in superblock");
-       } else {
-               prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
-
-               if (entry)
-                       prt_printf(out, " offset=%zi/%u",
-                                  (u64 *) entry - jset->_data,
-                                  le32_to_cpu(jset->u64s));
-       }
-
-       prt_str(out, ": ");
-}
-
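-/*
- * Report an invalid journal entry: at read time this is a fixable fsck
- * error, but at write time it means we're about to write corrupt metadata,
- * so count the error in the superblock and mark the filesystem inconsistent:
- */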
-#define journal_entry_err(c, version, jset, entry, _err, msg, ...)     \
-({                                                                     \
-       struct printbuf _buf = PRINTBUF;                                \
-                                                                       \
-       journal_entry_err_msg(&_buf, version, jset, entry);             \
-       prt_printf(&_buf, msg, ##__VA_ARGS__);                          \
-                                                                       \
-       switch (from.flags & BCH_VALIDATE_write) {                      \
-       case READ:                                                      \
-               mustfix_fsck_err(c, _err, "%s", _buf.buf);              \
-               break;                                                  \
-       case WRITE:                                                     \
-               bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);            \
-               if (bch2_fs_inconsistent(c,                             \
-                               "corrupt metadata before write: %s\n", _buf.buf)) {\
-                       ret = bch_err_throw(c, fsck_errors_not_fixed);          \
-                       goto fsck_err;                                  \
-               }                                                       \
-               break;                                                  \
-       }                                                               \
-                                                                       \
-       printbuf_exit(&_buf);                                           \
-       true;                                                           \
-})
-
-#define journal_entry_err_on(cond, ...)                                        \
-       ((cond) ? journal_entry_err(__VA_ARGS__) : false)
-
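-/*
- * Returned by journal_validate_key() when it deleted the key in place: the
- * remainder of the entry was shuffled down over it, so callers must not
- * advance to the next key:
- */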
-#define FSCK_DELETED_KEY       5
-
-static int journal_validate_key(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               struct bkey_i *k,
-                               struct bkey_validate_context from,
-                               unsigned version, int big_endian)
-{
-       enum bch_validate_flags flags = from.flags;
-       int write = flags & BCH_VALIDATE_write;
-       void *next = vstruct_next(entry);
-       int ret = 0;
-
-       if (journal_entry_err_on(!k->k.u64s,
-                                c, version, jset, entry,
-                                journal_entry_bkey_u64s_0,
-                                "k->u64s 0")) {
-               entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-               journal_entry_null_range(vstruct_next(entry), next);
-               return FSCK_DELETED_KEY;
-       }
-
-       if (journal_entry_err_on((void *) bkey_next(k) >
-                                (void *) vstruct_next(entry),
-                                c, version, jset, entry,
-                                journal_entry_bkey_past_end,
-                                "extends past end of journal entry")) {
-               entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-               journal_entry_null_range(vstruct_next(entry), next);
-               return FSCK_DELETED_KEY;
-       }
-
-       if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
-                                c, version, jset, entry,
-                                journal_entry_bkey_bad_format,
-                                "bad format %u", k->k.format)) {
-               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
-               memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-               journal_entry_null_range(vstruct_next(entry), next);
-               return FSCK_DELETED_KEY;
-       }
-
-       if (!write)
-               bch2_bkey_compat(from.level, from.btree, version, big_endian,
-                                write, NULL, bkey_to_packed(k));
-
-       ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
-       if (ret == -BCH_ERR_fsck_delete_bkey) {
-               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
-               memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-               journal_entry_null_range(vstruct_next(entry), next);
-               return FSCK_DELETED_KEY;
-       }
-       if (ret)
-               goto fsck_err;
-
-       if (write)
-               bch2_bkey_compat(from.level, from.btree, version, big_endian,
-                                write, NULL, bkey_to_packed(k));
-fsck_err:
-       return ret;
-}
-
-static int journal_entry_btree_keys_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct bkey_i *k = entry->start;
-
-       from.level      = entry->level;
-       from.btree      = entry->btree_id;
-
-       while (k != vstruct_last(entry)) {
-               int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
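-               /* on FSCK_DELETED_KEY the key was removed in place - don't advance k: */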
-               if (ret == FSCK_DELETED_KEY)
-                       continue;
-               else if (ret)
-                       return ret;
-
-               k = bkey_next(k);
-       }
-
-       return 0;
-}
-
-static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
-                                            struct jset_entry *entry)
-{
-       bool first = true;
-
-       jset_entry_for_each_key(entry, k) {
-               /* We may be called on entries that haven't been validated: */
-               if (!k->k.u64s)
-                       break;
-
-               if (!first) {
-                       prt_newline(out);
-                       bch2_prt_jset_entry_type(out, entry->type);
-                       prt_str(out, ": ");
-               }
-               bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
-               prt_char(out, ' ');
-               bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
-               first = false;
-       }
-}
-
-static int journal_entry_btree_root_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct bkey_i *k = entry->start;
-       int ret = 0;
-
-       from.root       = true;
-       from.level      = entry->level + 1;
-       from.btree      = entry->btree_id;
-
-       if (journal_entry_err_on(!entry->u64s ||
-                                le16_to_cpu(entry->u64s) != k->k.u64s,
-                                c, version, jset, entry,
-                                journal_entry_btree_root_bad_size,
-                                "invalid btree root journal entry: wrong number of keys")) {
-               void *next = vstruct_next(entry);
-               /*
-                * we don't want to null out this jset_entry,
-                * just the contents, so that later we can tell
-                * we were _supposed_ to have a btree root
-                */
-               entry->u64s = 0;
-               journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
-       }
-
-       ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
-       if (ret == FSCK_DELETED_KEY)
-               ret = 0;
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
-                                            struct jset_entry *entry)
-{
-       journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       /* obsolete, don't care: */
-       return 0;
-}
-
-static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-}
-
-static int journal_entry_blacklist_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
-                                c, version, jset, entry,
-                                journal_entry_blacklist_bad_size,
-               "invalid journal seq blacklist entry: bad size")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-       }
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-       struct jset_entry_blacklist *bl =
-               container_of(entry, struct jset_entry_blacklist, entry);
-
-       prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
-}
-
-static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct jset_entry_blacklist_v2 *bl_entry;
-       int ret = 0;
-
-       if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
-                                c, version, jset, entry,
-                                journal_entry_blacklist_v2_bad_size,
-               "invalid journal seq blacklist entry: bad size")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               goto out;
-       }
-
-       bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
-
-       if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
-                                le64_to_cpu(bl_entry->end),
-                                c, version, jset, entry,
-                                journal_entry_blacklist_v2_start_past_end,
-               "invalid journal seq blacklist entry: start > end")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-       }
-out:
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
-                                              struct jset_entry *entry)
-{
-       struct jset_entry_blacklist_v2 *bl =
-               container_of(entry, struct jset_entry_blacklist_v2, entry);
-
-       prt_printf(out, "start=%llu end=%llu",
-              le64_to_cpu(bl->start),
-              le64_to_cpu(bl->end));
-}
-
-static int journal_entry_usage_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct jset_entry_usage *u =
-               container_of(entry, struct jset_entry_usage, entry);
-       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
-       int ret = 0;
-
-       if (journal_entry_err_on(bytes < sizeof(*u),
-                                c, version, jset, entry,
-                                journal_entry_usage_bad_size,
-                                "invalid journal entry usage: bad size")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
-                                       struct jset_entry *entry)
-{
-       struct jset_entry_usage *u =
-               container_of(entry, struct jset_entry_usage, entry);
-
-       prt_str(out, "type=");
-       bch2_prt_fs_usage_type(out, u->entry.btree_id);
-       prt_printf(out, " v=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_data_usage_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct jset_entry_data_usage *u =
-               container_of(entry, struct jset_entry_data_usage, entry);
-       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
-       struct printbuf err = PRINTBUF;
-       int ret = 0;
-
-       if (journal_entry_err_on(bytes < sizeof(*u) ||
-                                bytes < sizeof(*u) + u->r.nr_devs,
-                                c, version, jset, entry,
-                                journal_entry_data_usage_bad_size,
-                                "invalid journal entry usage: bad size")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               goto out;
-       }
-
-       if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
-                                c, version, jset, entry,
-                                journal_entry_data_usage_bad_size,
-                                "invalid journal entry usage: %s", err.buf)) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               goto out;
-       }
-out:
-fsck_err:
-       printbuf_exit(&err);
-       return ret;
-}
-
-static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
-                                            struct jset_entry *entry)
-{
-       struct jset_entry_data_usage *u =
-               container_of(entry, struct jset_entry_data_usage, entry);
-
-       bch2_replicas_entry_to_text(out, &u->r);
-       prt_printf(out, "=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_clock_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct jset_entry_clock *clock =
-               container_of(entry, struct jset_entry_clock, entry);
-       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
-       int ret = 0;
-
-       if (journal_entry_err_on(bytes != sizeof(*clock),
-                                c, version, jset, entry,
-                                journal_entry_clock_bad_size,
-                                "bad size")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-
-       if (journal_entry_err_on(clock->rw > 1,
-                                c, version, jset, entry,
-                                journal_entry_clock_bad_rw,
-                                "bad rw")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
-                                       struct jset_entry *entry)
-{
-       struct jset_entry_clock *clock =
-               container_of(entry, struct jset_entry_clock, entry);
-
-       prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
-}
-
-static int journal_entry_dev_usage_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       struct jset_entry_dev_usage *u =
-               container_of(entry, struct jset_entry_dev_usage, entry);
-       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
-       unsigned expected = sizeof(*u);
-       int ret = 0;
-
-       if (journal_entry_err_on(bytes < expected,
-                                c, version, jset, entry,
-                                journal_entry_dev_usage_bad_size,
-                                "bad size (%u < %u)",
-                                bytes, expected)) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-
-       if (journal_entry_err_on(u->pad,
-                                c, version, jset, entry,
-                                journal_entry_dev_usage_bad_pad,
-                                "bad pad")) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-       struct jset_entry_dev_usage *u =
-               container_of(entry, struct jset_entry_dev_usage, entry);
-       unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
-       if (vstruct_bytes(entry) < sizeof(*u))
-               return;
-
-       prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
-
-       printbuf_indent_add(out, 2);
-       for (i = 0; i < nr_types; i++) {
-               prt_newline(out);
-               bch2_prt_data_type(out, i);
-               prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
-                      le64_to_cpu(u->d[i].buckets),
-                      le64_to_cpu(u->d[i].sectors),
-                      le64_to_cpu(u->d[i].fragmented));
-       }
-       printbuf_indent_sub(out, 2);
-}
-
-static int journal_entry_log_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       return 0;
-}
-
-static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
-                                     struct jset_entry *entry)
-{
-       struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
-
-       prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
-}
-
-static int journal_entry_overwrite_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       from.flags = 0;
-       return journal_entry_btree_keys_validate(c, jset, entry,
-                               version, big_endian, from);
-}
-
-static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-       journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_log_bkey_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       from.flags = 0;
-       return journal_entry_btree_keys_validate(c, jset, entry,
-                               version, big_endian, from);
-}
-
-static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
-                                          struct jset_entry *entry)
-{
-       journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       return journal_entry_btree_keys_validate(c, jset, entry,
-                               version, big_endian, from);
-}
-
-static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-       journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_datetime_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       unsigned bytes = vstruct_bytes(entry);
-       unsigned expected = 16;
-       int ret = 0;
-
-       if (journal_entry_err_on(bytes < expected,
-                                c, version, jset, entry,
-                                journal_entry_dev_usage_bad_size,
-                                "bad size (%u < %u)",
-                                bytes, expected)) {
-               journal_entry_null_range(entry, vstruct_next(entry));
-               return ret;
-       }
-fsck_err:
-       return ret;
-}
-
-static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
-                                           struct jset_entry *entry)
-{
-       struct jset_entry_datetime *datetime =
-               container_of(entry, struct jset_entry_datetime, entry);
-
-       bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-}
-
-struct jset_entry_ops {
-       int (*validate)(struct bch_fs *, struct jset *,
-                       struct jset_entry *, unsigned, int,
-                       struct bkey_validate_context);
-       void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
-};
-
-static const struct jset_entry_ops bch2_jset_entry_ops[] = {
-#define x(f, nr)                                               \
-       [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
-               .validate       = journal_entry_##f##_validate, \
-               .to_text        = journal_entry_##f##_to_text,  \
-       },
-       BCH_JSET_ENTRY_TYPES()
-#undef x
-};
-
-int bch2_journal_entry_validate(struct bch_fs *c,
-                               struct jset *jset,
-                               struct jset_entry *entry,
-                               unsigned version, int big_endian,
-                               struct bkey_validate_context from)
-{
-       return entry->type < BCH_JSET_ENTRY_NR
-               ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
-                               version, big_endian, from)
-               : 0;
-}
-
-void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
-                               struct jset_entry *entry)
-{
-       bch2_prt_jset_entry_type(out, entry->type);
-
-       if (entry->type < BCH_JSET_ENTRY_NR) {
-               prt_str(out, ": ");
-               bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
-       }
-}
-
-static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
-                                enum bch_validate_flags flags)
-{
-       struct bkey_validate_context from = {
-               .flags          = flags,
-               .from           = BKEY_VALIDATE_journal,
-               .journal_seq    = le64_to_cpu(jset->seq),
-       };
-
-       unsigned version = le32_to_cpu(jset->version);
-       int ret = 0;
-
-       vstruct_for_each(jset, entry) {
-               from.journal_offset = (u64 *) entry - jset->_data;
-
-               if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
-                               c, version, jset, entry,
-                               journal_entry_past_jset_end,
-                               "journal entry extends past end of jset")) {
-                       jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
-                       break;
-               }
-
-               ret = bch2_journal_entry_validate(c, jset, entry, version,
-                                                 JSET_BIG_ENDIAN(jset), from);
-               if (ret)
-                       break;
-       }
-fsck_err:
-       return ret;
-}
-
-static int jset_validate(struct bch_fs *c,
-                        struct bch_dev *ca,
-                        struct jset *jset, u64 sector,
-                        enum bch_validate_flags flags)
-{
-       struct bkey_validate_context from = {
-               .flags          = flags,
-               .from           = BKEY_VALIDATE_journal,
-               .journal_seq    = le64_to_cpu(jset->seq),
-       };
-       int ret = 0;
-
-       if (le64_to_cpu(jset->magic) != jset_magic(c))
-               return JOURNAL_ENTRY_NONE;
-
-       unsigned version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version),
-                       c, version, jset, NULL,
-                       jset_unsupported_version,
-                       "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq),
-                       BCH_VERSION_MAJOR(version),
-                       BCH_VERSION_MINOR(version))) {
-               /* don't try to continue: */
-               return -EINVAL;
-       }
-
-       if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
-                       c, version, jset, NULL,
-                       jset_unknown_csum,
-                       "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq),
-                       JSET_CSUM_TYPE(jset)))
-               ret = JOURNAL_ENTRY_BAD;
-
-       /* last_seq is ignored when JSET_NO_FLUSH is true */
-       if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
-                                le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
-                                c, version, jset, NULL,
-                                jset_last_seq_newer_than_seq,
-                                "invalid journal entry: last_seq > seq (%llu > %llu)",
-                                le64_to_cpu(jset->last_seq),
-                                le64_to_cpu(jset->seq))) {
-               jset->last_seq = jset->seq;
-               return JOURNAL_ENTRY_BAD;
-       }
-
-       ret = jset_validate_entries(c, jset, flags);
-fsck_err:
-       return ret;
-}
-
-static int jset_validate_early(struct bch_fs *c,
-                        struct bch_dev *ca,
-                        struct jset *jset, u64 sector,
-                        unsigned bucket_sectors_left,
-                        unsigned sectors_read)
-{
-       struct bkey_validate_context from = {
-               .from           = BKEY_VALIDATE_journal,
-               .journal_seq    = le64_to_cpu(jset->seq),
-       };
-       int ret = 0;
-
-       if (le64_to_cpu(jset->magic) != jset_magic(c))
-               return JOURNAL_ENTRY_NONE;
-
-       unsigned version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version),
-                       c, version, jset, NULL,
-                       jset_unsupported_version,
-                       "%s sector %llu seq %llu: unknown journal entry version %u.%u",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq),
-                       BCH_VERSION_MAJOR(version),
-                       BCH_VERSION_MINOR(version))) {
-               /* don't try to continue: */
-               return -EINVAL;
-       }
-
-       size_t bytes = vstruct_bytes(jset);
-       if (bytes > (sectors_read << 9) &&
-           sectors_read < bucket_sectors_left)
-               return JOURNAL_ENTRY_REREAD;
-
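-       /* the entry claims to extend past the end of the bucket - truncate it to fit: */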
-       if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
-                       c, version, jset, NULL,
-                       jset_past_bucket_end,
-                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq), bytes))
-               le32_add_cpu(&jset->u64s,
-                            -((bytes - (bucket_sectors_left << 9)) / 8));
-fsck_err:
-       return ret;
-}
-
-struct journal_read_buf {
-       void            *data;
-       size_t          size;
-};
-
-static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
-                                   size_t new_size)
-{
-       void *n;
-
-       /* the bios are sized for this many pages, max: */
-       if (new_size > JOURNAL_ENTRY_SIZE_MAX)
-               return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
-       new_size = roundup_pow_of_two(new_size);
-       n = kvmalloc(new_size, GFP_KERNEL);
-       if (!n)
-               return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
-       kvfree(b->data);
-       b->data = n;
-       b->size = new_size;
-       return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
-                              struct journal_read_buf *buf,
-                              struct journal_list *jlist,
-                              unsigned bucket)
-{
-       struct bch_fs *c = ca->fs;
-       struct journal_device *ja = &ca->journal;
-       struct jset *j = NULL;
-       unsigned sectors, sectors_read = 0;
-       u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
-           end = offset + ca->mi.bucket_size;
-       bool saw_bad = false, csum_good;
-       int ret = 0;
-
-       pr_debug("reading %u", bucket);
-
-       while (offset < end) {
-               if (!sectors_read) {
-                       struct bio *bio;
-                       unsigned nr_bvecs;
-reread:
-                       sectors_read = min_t(unsigned,
-                               end - offset, buf->size >> 9);
-                       nr_bvecs = buf_pages(buf->data, sectors_read << 9);
-
-                       bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-                       if (!bio)
-                               return bch_err_throw(c, ENOMEM_journal_read_bucket);
-                       bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
-
-                       bio->bi_iter.bi_sector = offset;
-                       bch2_bio_map(bio, buf->data, sectors_read << 9);
-
-                       u64 submit_time = local_clock();
-                       ret = submit_bio_wait(bio);
-                       kfree(bio);
-
-                       if (!ret && bch2_meta_read_fault("journal"))
-                               ret = bch_err_throw(c, EIO_fault_injected);
-
-                       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-                                                  submit_time, !ret);
-
-                       if (ret) {
-                               bch_err_dev_ratelimited(ca,
-                                       "journal read error: sector %llu", offset);
-                               /*
-                                * We don't error out of the recovery process
-                                * here, since the relevant journal entry may
-                                * be found on a different device; missing
-                                * journal entries will be handled later
-                                */
-                               return 0;
-                       }
-
-                       j = buf->data;
-               }
-
-               ret = jset_validate_early(c, ca, j, offset,
-                                   end - offset, sectors_read);
-               switch (ret) {
-               case 0:
-                       sectors = vstruct_sectors(j, c->block_bits);
-                       break;
-               case JOURNAL_ENTRY_REREAD:
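-                       /*
-                        * the entry extends past what we've read so far: grow
-                        * the buffer if needed and reissue the read from the
-                        * same offset:
-                        */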
-                       if (vstruct_bytes(j) > buf->size) {
-                               ret = journal_read_buf_realloc(c, buf,
-                                                       vstruct_bytes(j));
-                               if (ret)
-                                       return ret;
-                       }
-                       goto reread;
-               case JOURNAL_ENTRY_NONE:
-                       if (!saw_bad)
-                               return 0;
-                       /*
-                        * On checksum error we don't really trust the size
-                        * field of the journal entry we read, so try reading
-                        * again at the next block boundary:
-                        */
-                       sectors = block_sectors(c);
-                       goto next_block;
-               default:
-                       return ret;
-               }
-
-               if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
-                       ja->highest_seq_found = le64_to_cpu(j->seq);
-                       ja->cur_idx = bucket;
-                       ja->sectors_free = ca->mi.bucket_size -
-                               bucket_remainder(ca, offset) - sectors;
-               }
-
-               /*
-                * This happens sometimes if we don't have discards on -
-                * when we've partially overwritten a bucket with new
-                * journal entries. We don't need the rest of the
-                * bucket:
-                */
-               if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
-                       return 0;
-
-               ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
-               struct bch_csum csum;
-               csum_good = jset_csum_good(c, j, &csum);
-
-               bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
-               if (!csum_good) {
-                       /*
-                        * Don't print an error here; we'll print it later if
-                        * we actually need this journal entry
-                        */
-                       saw_bad = true;
-               }
-
-               ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
-                            j->encrypted_start,
-                            vstruct_end(j) - (void *) j->encrypted_start);
-               bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
-
-               mutex_lock(&jlist->lock);
-               ret = journal_entry_add(c, ca, (struct journal_ptr) {
-                                       .csum_good      = csum_good,
-                                       .csum           = csum,
-                                       .dev            = ca->dev_idx,
-                                       .bucket         = bucket,
-                                       .bucket_offset  = offset -
-                                               bucket_to_sector(ca, ja->buckets[bucket]),
-                                       .sector         = offset,
-                                       }, jlist, j);
-               mutex_unlock(&jlist->lock);
-
-               switch (ret) {
-               case JOURNAL_ENTRY_ADD_OK:
-                       break;
-               case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
-                       break;
-               default:
-                       return ret;
-               }
-next_block:
-               pr_debug("next");
-               offset          += sectors;
-               sectors_read    -= sectors;
-               j = ((void *) j) + (sectors << 9);
-       }
-
-       return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_journal_read_device)
-{
-       closure_type(ja, struct journal_device, read);
-       struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
-       struct bch_fs *c = ca->fs;
-       struct journal_list *jlist =
-               container_of(cl->parent, struct journal_list, cl);
-       struct journal_read_buf buf = { NULL, 0 };
-       unsigned i;
-       int ret = 0;
-
-       if (!ja->nr)
-               goto out;
-
-       ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
-       if (ret)
-               goto err;
-
-       pr_debug("%u journal buckets", ja->nr);
-
-       for (i = 0; i < ja->nr; i++) {
-               ret = journal_read_bucket(ca, &buf, jlist, i);
-               if (ret)
-                       goto err;
-       }
-
-       /*
-        * Set dirty_idx to indicate the entire journal is full and needs to be
-        * reclaimed - journal reclaim will immediately reclaim whatever isn't
-        * pinned when it first runs:
-        */
-       ja->discard_idx = ja->dirty_idx_ondisk =
-               ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
-out:
-       bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
-       kvfree(buf.data);
-       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
-       closure_return(cl);
-       return;
-err:
-       mutex_lock(&jlist->lock);
-       jlist->ret = ret;
-       mutex_unlock(&jlist->lock);
-       goto out;
-}
-
-noinline_for_stack
-static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
-{
-       struct printbuf buf = PRINTBUF;
-       enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
-       bool have_good = false;
-
-       prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
-       bch2_journal_datetime_to_text(&buf, &j->j);
-       prt_newline(&buf);
-
-       darray_for_each(j->ptrs, ptr)
-               if (!ptr->csum_good) {
-                       bch2_journal_ptr_to_text(&buf, c, ptr);
-                       prt_char(&buf, ' ');
-                       bch2_csum_to_text(&buf, csum_type, ptr->csum);
-                       prt_newline(&buf);
-               } else {
-                       have_good = true;
-               }
-
-       prt_printf(&buf, "should be ");
-       bch2_csum_to_text(&buf, csum_type, j->j.csum);
-
-       if (have_good)
-               prt_printf(&buf, "\n(had good copy on another device)");
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
-{
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       struct genradix_iter radix_iter;
-       struct journal_replay *i, **_i, *prev = NULL;
-       u64 seq = start_seq;
-
-       genradix_for_each(&c->journal_entries, radix_iter, _i) {
-               i = *_i;
-
-               if (journal_replay_ignore(i))
-                       continue;
-
-               BUG_ON(seq > le64_to_cpu(i->j.seq));
-
-               while (seq < le64_to_cpu(i->j.seq)) {
-                       while (seq < le64_to_cpu(i->j.seq) &&
-                              bch2_journal_seq_is_blacklisted(c, seq, false))
-                               seq++;
-
-                       if (seq == le64_to_cpu(i->j.seq))
-                               break;
-
-                       u64 missing_start = seq;
-
-                       while (seq < le64_to_cpu(i->j.seq) &&
-                              !bch2_journal_seq_is_blacklisted(c, seq, false))
-                               seq++;
-
-                       u64 missing_end = seq - 1;
-
-                       printbuf_reset(&buf);
-                       prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                                  missing_start, missing_end,
-                                  start_seq, end_seq);
-
-                       prt_printf(&buf, "\nprev at ");
-                       if (prev) {
-                               bch2_journal_ptrs_to_text(&buf, c, prev);
-                               prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
-                       } else
-                               prt_printf(&buf, "(none)");
-
-                       prt_printf(&buf, "\nnext at ");
-                       bch2_journal_ptrs_to_text(&buf, c, i);
-                       prt_printf(&buf, ", continue?");
-
-                       fsck_err(c, journal_entries_missing, "%s", buf.buf);
-               }
-
-               prev = i;
-               seq++;
-       }
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_journal_read(struct bch_fs *c,
-                     u64 *last_seq,
-                     u64 *blacklist_seq,
-                     u64 *start_seq)
-{
-       struct journal_list jlist;
-       struct journal_replay *i, **_i;
-       struct genradix_iter radix_iter;
-       struct printbuf buf = PRINTBUF;
-       bool degraded = false, last_write_torn = false;
-       u64 seq;
-       int ret = 0;
-
-       closure_init_stack(&jlist.cl);
-       mutex_init(&jlist.lock);
-       jlist.last_seq = 0;
-       jlist.ret = 0;
-
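-       /*
-        * Read journal buckets from every device that might hold journal
-        * data; when fsck is enabled, scan all devices, even those not marked
-        * as having journal data:
-        */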
-       for_each_member_device(c, ca) {
-               if (!c->opts.fsck &&
-                   !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
-                       continue;
-
-               if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
-                    ca->mi.state == BCH_MEMBER_STATE_ro) &&
-                   enumerated_ref_tryget(&ca->io_ref[READ],
-                                         BCH_DEV_READ_REF_journal_read))
-                       closure_call(&ca->journal.read,
-                                    bch2_journal_read_device,
-                                    system_dfl_wq,
-                                    &jlist.cl);
-               else
-                       degraded = true;
-       }
-
-       while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
-               ;
-
-       if (jlist.ret)
-               return jlist.ret;
-
-       *last_seq       = 0;
-       *start_seq      = 0;
-       *blacklist_seq  = 0;
-
-       /*
-        * Find the most recent flush entry, and ignore newer non-flush
-        * entries - those entries will be blacklisted:
-        */
-       genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
-               i = *_i;
-
-               if (journal_replay_ignore(i))
-                       continue;
-
-               if (!*start_seq)
-                       *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
-
-               if (JSET_NO_FLUSH(&i->j)) {
-                       i->ignore_blacklisted = true;
-                       continue;
-               }
-
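-               /*
-                * Tolerate a single torn write at the tip of the journal: the
-                * newest entry with a bad checksum was likely mid-write at
-                * crash time, so skip it rather than failing recovery:
-                */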
-               if (!last_write_torn && !i->csum_good) {
-                       last_write_torn = true;
-                       i->ignore_blacklisted = true;
-                       continue;
-               }
-
-               struct bkey_validate_context from = {
-                       .from           = BKEY_VALIDATE_journal,
-                       .journal_seq    = le64_to_cpu(i->j.seq),
-               };
-               if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
-                                        c, le32_to_cpu(i->j.version), &i->j, NULL,
-                                        jset_last_seq_newer_than_seq,
-                                        "invalid journal entry: last_seq > seq (%llu > %llu)",
-                                        le64_to_cpu(i->j.last_seq),
-                                        le64_to_cpu(i->j.seq)))
-                       i->j.last_seq = i->j.seq;
-
-               *last_seq       = le64_to_cpu(i->j.last_seq);
-               *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
-               break;
-       }
-
-       if (!*start_seq) {
-               bch_info(c, "journal read done, but no entries found");
-               return 0;
-       }
-
-       if (!*last_seq) {
-               fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
-                        "journal read done, but no entries found after dropping non-flushes");
-               return 0;
-       }
-
-       printbuf_reset(&buf);
-       prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
-                  *last_seq, *blacklist_seq - 1);
-
-       /*
-        * Drop blacklisted entries and entries older than last_seq (or the
-        * journal rewind point, if set):
-        */
-       u64 drop_before = *last_seq;
-       if (c->opts.journal_rewind) {
-               drop_before = min(drop_before, c->opts.journal_rewind);
-               prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
-       }
-
-       *last_seq = drop_before;
-       if (*start_seq != *blacklist_seq)
-               prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
-       bch_info(c, "%s", buf.buf);
-       genradix_for_each(&c->journal_entries, radix_iter, _i) {
-               i = *_i;
-
-               if (journal_replay_ignore(i))
-                       continue;
-
-               seq = le64_to_cpu(i->j.seq);
-               if (seq < drop_before) {
-                       journal_replay_free(c, i, false);
-                       continue;
-               }
-
-               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
-                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
-                                   jset_seq_blacklisted,
-                                   "found blacklisted journal entry %llu", seq);
-                       i->ignore_blacklisted = true;
-               }
-       }
-
-       ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
-       if (ret)
-               goto err;
-
-       genradix_for_each(&c->journal_entries, radix_iter, _i) {
-               union bch_replicas_padded replicas = {
-                       .e.data_type = BCH_DATA_journal,
-                       .e.nr_devs = 0,
-                       .e.nr_required = 1,
-               };
-
-               i = *_i;
-               if (journal_replay_ignore(i))
-                       continue;
-
-               /*
-                * Don't print checksum errors until we know we're going to use
-                * a given journal entry:
-                */
-               darray_for_each(i->ptrs, ptr)
-                       if (!ptr->csum_good) {
-                               bch2_journal_print_checksum_error(c, i);
-                               break;
-                       }
-
-               ret = jset_validate(c,
-                                   bch2_dev_have_ref(c, i->ptrs.data[0].dev),
-                                   &i->j,
-                                   i->ptrs.data[0].sector,
-                                   READ);
-               if (ret)
-                       goto err;
-
-               darray_for_each(i->ptrs, ptr)
-                       replicas_entry_add_dev(&replicas.e, ptr->dev);
-
-               bch2_replicas_entry_sort(&replicas.e);
-
-               printbuf_reset(&buf);
-               bch2_replicas_entry_to_text(&buf, &replicas.e);
-
-               if (!degraded &&
-                   !bch2_replicas_marked(c, &replicas.e) &&
-                   (le64_to_cpu(i->j.seq) == *last_seq ||
-                    fsck_err(c, journal_entry_replicas_not_marked,
-                             "superblock not marked as containing replicas for journal entry %llu\n%s",
-                             le64_to_cpu(i->j.seq), buf.buf))) {
-                       ret = bch2_mark_replicas(c, &replicas.e);
-                       if (ret)
-                               goto err;
-               }
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/* journal write: */
-
-static void journal_advance_devs_to_next_bucket(struct journal *j,
-                                               struct dev_alloc_list *devs,
-                                               unsigned sectors, __le64 seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       guard(rcu)();
-       darray_for_each(*devs, i) {
-               struct bch_dev *ca = rcu_dereference(c->devs[*i]);
-               if (!ca)
-                       continue;
-
-               struct journal_device *ja = &ca->journal;
-
-               if (sectors > ja->sectors_free &&
-                   sectors <= ca->mi.bucket_size &&
-                   bch2_journal_dev_buckets_available(j, ja,
-                                       journal_space_discarded)) {
-                       ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-                       ja->sectors_free = ca->mi.bucket_size;
-
-                       /*
-                        * ja->bucket_seq[ja->cur_idx] must always have
-                        * something sensible:
-                        */
-                       ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
-               }
-       }
-}
-
-static void __journal_write_alloc(struct journal *j,
-                                 struct journal_buf *w,
-                                 struct dev_alloc_list *devs,
-                                 unsigned sectors,
-                                 unsigned *replicas,
-                                 unsigned replicas_want)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       darray_for_each(*devs, i) {
-               struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
-                                       BCH_DEV_WRITE_REF_journal_write);
-               if (!ca)
-                       continue;
-
-               struct journal_device *ja = &ca->journal;
-
-               /*
-                * Check that we can use this device, and aren't already using
-                * it:
-                */
-               if (!ca->mi.durability ||
-                   ca->mi.state != BCH_MEMBER_STATE_rw ||
-                   !ja->nr ||
-                   bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
-                   sectors > ja->sectors_free) {
-                       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
-                       continue;
-               }
-
-               bch2_dev_stripe_increment(ca, &j->wp.stripe);
-
-               bch2_bkey_append_ptr(&w->key,
-                       (struct bch_extent_ptr) {
-                                 .offset = bucket_to_sector(ca,
-                                       ja->buckets[ja->cur_idx]) +
-                                       ca->mi.bucket_size -
-                                       ja->sectors_free,
-                                 .dev = ca->dev_idx,
-               });
-
-               ja->sectors_free -= sectors;
-               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-
-               *replicas += ca->mi.durability;
-
-               if (*replicas >= replicas_want)
-                       break;
-       }
-}
-
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
-                              unsigned *replicas)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_devs_mask devs;
-       struct dev_alloc_list devs_sorted;
-       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
-       unsigned target = c->opts.metadata_target ?:
-               c->opts.foreground_target;
-       unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
-       unsigned replicas_need = min_t(unsigned, replicas_want,
-                                      READ_ONCE(c->opts.metadata_replicas_required));
-       bool advance_done = false;
-
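-       /*
-        * Allocation strategy: try the devices in the configured target
-        * first; if that doesn't reach replicas_want, advance devices to
-        * their next journal bucket and retry, then as a last resort retry
-        * across all rw devices:
-        */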
-retry_target:
-       devs = target_rw_devs(c, BCH_DATA_journal, target);
-       bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
-retry_alloc:
-       __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
-
-       if (likely(*replicas >= replicas_want))
-               goto done;
-
-       if (!advance_done) {
-               journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
-               advance_done = true;
-               goto retry_alloc;
-       }
-
-       if (*replicas < replicas_want && target) {
-               /* Retry from all devices: */
-               target = 0;
-               advance_done = false;
-               goto retry_target;
-       }
-done:
-       BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
-
-#if 0
-       /*
-        * XXX: we need a way to alert the user when we go degraded for any
-        * reason
-        */
-       if (*replicas < min(replicas_want,
-                           dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
-       }
-#endif
-
-       return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
-}
-
-static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       /* we aren't holding j->lock: */
-       unsigned new_size = READ_ONCE(j->buf_size_want);
-       void *new_buf;
-
-       if (buf->buf_size >= new_size)
-               return;
-
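-       /* keep the btree write buffer sized in proportion to the journal buffer: */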
-       size_t btree_write_buffer_size = new_size / 64;
-
-       if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
-               return;
-
-       new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
-       if (!new_buf)
-               return;
-
-       memcpy(new_buf, buf->data, buf->buf_size);
-
-       spin_lock(&j->lock);
-       swap(buf->data,         new_buf);
-       swap(buf->buf_size,     new_size);
-       spin_unlock(&j->lock);
-
-       kvfree(new_buf);
-}
-
-static CLOSURE_CALLBACK(journal_write_done)
-{
-       closure_type(w, struct journal_buf, io);
-       struct journal *j = container_of(w, struct journal, buf[w->idx]);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       union bch_replicas_padded replicas;
-       u64 seq = le64_to_cpu(w->data->seq);
-       int err = 0;
-
-       bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
-                              ? j->flush_write_time
-                              : j->noflush_write_time, j->write_start_time);
-
-       if (!w->devs_written.nr) {
-               err = bch_err_throw(c, journal_write_err);
-       } else {
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-                                        w->devs_written);
-               err = bch2_mark_replicas(c, &replicas.e);
-       }
-
-       if (err && !bch2_journal_error(j)) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               if (err == -BCH_ERR_journal_write_err)
-                       prt_printf(&buf, "unable to write journal to sufficient devices\n");
-               else
-                       prt_printf(&buf, "journal write error marking replicas: %s\n",
-                                  bch2_err_str(err));
-
-               bch2_fs_emergency_read_only2(c, &buf);
-
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       closure_debug_destroy(cl);
-
-       spin_lock(&j->lock);
-       if (seq >= j->pin.front)
-               journal_seq_pin(j, seq)->devs = w->devs_written;
-       if (err && (!j->err_seq || seq < j->err_seq))
-               j->err_seq      = seq;
-       w->write_done = true;
-
-       if (!j->free_buf || j->free_buf_size < w->buf_size) {
-               swap(j->free_buf,       w->data);
-               swap(j->free_buf_size,  w->buf_size);
-       }
-
-       if (w->data) {
-               void *buf = w->data;
-               w->data = NULL;
-               w->buf_size = 0;
-
-               spin_unlock(&j->lock);
-               kvfree(buf);
-               spin_lock(&j->lock);
-       }
-
-       bool completed = false;
-       bool do_discards = false;
-
-       for (seq = journal_last_unwritten_seq(j);
-            seq <= journal_cur_seq(j);
-            seq++) {
-               w = j->buf + (seq & JOURNAL_BUF_MASK);
-               if (!w->write_done)
-                       break;
-
-               if (!j->err_seq && !w->noflush) {
-                       j->flushed_seq_ondisk = seq;
-                       j->last_seq_ondisk = w->last_seq;
-
-                       closure_wake_up(&c->freelist_wait);
-                       bch2_reset_alloc_cursors(c);
-                       do_discards = true;
-               }
-
-               j->seq_ondisk = seq;
-
-               /*
-                * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
-                * more buckets:
-                *
-                * Must come before signaling write completion, for
-                * bch2_fs_journal_stop():
-                */
-               if (j->watermark != BCH_WATERMARK_stripe)
-                       journal_reclaim_kick(&c->journal);
-
-               closure_wake_up(&w->wait);
-               completed = true;
-       }
-
-       if (completed) {
-               bch2_journal_reclaim_fast(j);
-               bch2_journal_space_available(j);
-
-               track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
-
-               journal_wake(j);
-       }
-
-       if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
-           j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
-               struct journal_buf *buf = journal_cur_buf(j);
-               long delta = buf->expires - jiffies;
-
-               /*
-                * We don't close a journal entry to write it while there's
-                * previous entries still in flight - the current journal entry
-                * might want to be written now:
-                */
-               mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
-       }
-
-       /*
-        * We don't typically trigger journal writes from here - the next journal
-        * write will be triggered immediately after the previous one is
-        * allocated, in bch2_journal_write() - but the journal write error path
-        * is special:
-        */
-       bch2_journal_do_writes(j);
-       spin_unlock(&j->lock);
-
-       if (do_discards)
-               bch2_do_discards(c);
-}
-
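-/*
- * Per-device bio completion: account the IO and, on error, drop this device
- * from the set of devices the entry was successfully written to:
- */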
-static void journal_write_endio(struct bio *bio)
-{
-       struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
-       struct bch_dev *ca = jbio->ca;
-       struct journal *j = &ca->fs->journal;
-       struct journal_buf *w = j->buf + jbio->buf_idx;
-
-       bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
-                                  jbio->submit_time, !bio->bi_status);
-
-       if (bio->bi_status) {
-               bch_err_dev_ratelimited(ca,
-                              "error writing journal entry %llu: %s",
-                              le64_to_cpu(w->data->seq),
-                              bch2_blk_status_to_str(bio->bi_status));
-
-               unsigned long flags;
-               spin_lock_irqsave(&j->err_lock, flags);
-               bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
-               spin_unlock_irqrestore(&j->err_lock, flags);
-       }
-
-       closure_put(&w->io);
-       enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
-}
-
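-/*
- * Submit the write to every device pointer in the journal key; flush writes
- * get REQ_FUA, plus REQ_PREFLUSH if we didn't already issue separate
- * flushes:
- */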
-static CLOSURE_CALLBACK(journal_write_submit)
-{
-       closure_type(w, struct journal_buf, io);
-       struct journal *j = container_of(w, struct journal, buf[w->idx]);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
-
-       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
-               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
-                            sectors);
-
-               struct journal_device *ja = &ca->journal;
-               struct journal_bio *jbio = ja->bio[w->idx];
-               struct bio *bio = &jbio->bio;
-
-               jbio->submit_time       = local_clock();
-
-               bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
-               bio->bi_iter.bi_sector  = ptr->offset;
-               bio->bi_end_io          = journal_write_endio;
-               bio->bi_private         = ca;
-               bio->bi_ioprio          = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
-
-               BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
-               ca->prev_journal_sector = bio->bi_iter.bi_sector;
-
-               if (!JSET_NO_FLUSH(w->data))
-                       bio->bi_opf    |= REQ_FUA;
-               if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
-                       bio->bi_opf    |= REQ_PREFLUSH;
-
-               bch2_bio_map(bio, w->data, sectors << 9);
-
-               trace_and_count(c, journal_write, bio);
-               closure_bio_submit(bio, cl);
-
-               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-       }
-
-       continue_at(cl, journal_write_done, j->wq);
-}
-
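-/*
- * For flush writes: wait until previous journal writes have completed, then
- * issue separate preflushes to every rw device, if needed, before submitting
- * the write itself:
- */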
-static CLOSURE_CALLBACK(journal_write_preflush)
-{
-       closure_type(w, struct journal_buf, io);
-       struct journal *j = container_of(w, struct journal, buf[w->idx]);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       /*
-        * Wait for previous journal writes to complete; they won't necessarily
-        * be flushed if they're still in flight
-        */
-       if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
-               spin_lock(&j->lock);
-               if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
-                       closure_wait(&j->async_wait, cl);
-                       spin_unlock(&j->lock);
-                       continue_at(cl, journal_write_preflush, j->wq);
-                       return;
-               }
-               spin_unlock(&j->lock);
-       }
-
-       if (w->separate_flush) {
-               for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
-                       enumerated_ref_get(&ca->io_ref[WRITE],
-                                          BCH_DEV_WRITE_REF_journal_write);
-
-                       struct journal_device *ja = &ca->journal;
-                       struct bio *bio = &ja->bio[w->idx]->bio;
-                       bio_reset(bio, ca->disk_sb.bdev,
-                                 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
-                       bio->bi_end_io          = journal_write_endio;
-                       bio->bi_private         = ca;
-                       closure_bio_submit(bio, cl);
-               }
-
-               continue_at(cl, journal_write_submit, j->wq);
-       } else {
-               /*
-                * no need to punt to another work item if we're not waiting on
-                * preflushes
-                */
-               journal_write_submit(&cl->work);
-       }
-}
-
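-/*
- * Prepare the journal entry for writing: compact it, flush write buffer keys
- * to the btree write buffer, propagate and re-add btree roots, append the
- * datetime and superblock entries, and check that we haven't overrun the
- * space that was reserved:
- */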
-static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct jset_entry *start, *end;
-       struct jset *jset = w->data;
-       struct journal_keys_to_wb wb = { NULL };
-       unsigned u64s;
-       unsigned long btree_roots_have = 0;
-       u64 seq = le64_to_cpu(jset->seq);
-       int ret;
-
-       /*
-        * Simple compaction, dropping empty jset_entries (from journal
-        * reservations that weren't fully used) and merging jset_entries that
-        * can be.
-        *
-        * If we wanted to be really fancy here, we could sort all the keys in
-        * the jset and drop keys that were overwritten - probably not worth it:
-        */
-       vstruct_for_each(jset, i) {
-               unsigned u64s = le16_to_cpu(i->u64s);
-
-               /* Empty entry: */
-               if (!u64s)
-                       continue;
-
-               /*
-                * New btree roots are set by journalling them; when the journal
-                * entry gets written we have to propagate them to
-                * c->btree_roots
-                *
-                * But, every journal entry we write has to contain all the
-                * btree roots (at least for now); so after we copy btree roots
-                * to c->btree_roots we have to get any missing btree roots and
-                * add them to this journal entry:
-                */
-               switch (i->type) {
-               case BCH_JSET_ENTRY_btree_root:
-                       bch2_journal_entry_to_btree_root(c, i);
-                       __set_bit(i->btree_id, &btree_roots_have);
-                       break;
-               case BCH_JSET_ENTRY_write_buffer_keys:
-                       EBUG_ON(!w->need_flush_to_write_buffer);
-
-                       if (!wb.wb)
-                               bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
-
-                       jset_entry_for_each_key(i, k) {
-                               ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
-                               if (ret) {
-                                       bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
-                                                           bch2_err_str(ret));
-                                       bch2_journal_keys_to_write_buffer_end(c, &wb);
-                                       return ret;
-                               }
-                       }
-                       i->type = BCH_JSET_ENTRY_btree_keys;
-                       break;
-               }
-       }
-
-       if (wb.wb) {
-               ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
-               if (ret) {
-                       bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
-                                           bch2_err_str(ret));
-                       return ret;
-               }
-       }
-
-       spin_lock(&c->journal.lock);
-       w->need_flush_to_write_buffer = false;
-       spin_unlock(&c->journal.lock);
-
-       start = end = vstruct_last(jset);
-
-       end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
-
-       struct jset_entry_datetime *d =
-               container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
-       d->entry.type   = BCH_JSET_ENTRY_datetime;
-       d->seconds      = cpu_to_le64(ktime_get_real_seconds());
-
-       bch2_journal_super_entries_add_common(c, &end, seq);
-       u64s    = (u64 *) end - (u64 *) start;
-
-       WARN_ON(u64s > j->entry_u64s_reserved);
-
-       le32_add_cpu(&jset->u64s, u64s);
-
-       unsigned sectors = vstruct_sectors(jset, c->block_bits);
-
-       if (sectors > w->sectors) {
-               bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
-                                   vstruct_bytes(jset), w->sectors << 9,
-                                   u64s, w->u64s_reserved, j->entry_u64s_reserved);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
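-/*
- * Finalize the on disk format: set magic, version and checksum type,
- * validate (before or after checksumming, depending on checksum type and
- * version), encrypt, checksum, and zero the rest of the last sector:
- */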
-static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct jset *jset = w->data;
-       u64 seq = le64_to_cpu(jset->seq);
-       bool validate_before_checksum = false;
-       int ret = 0;
-
-       jset->magic             = cpu_to_le64(jset_magic(c));
-       jset->version           = cpu_to_le32(c->sb.version);
-
-       SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
-       SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
-
-       if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
-               j->last_empty_seq = seq;
-
-       if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
-               validate_before_checksum = true;
-
-       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
-               validate_before_checksum = true;
-
-       if (validate_before_checksum &&
-           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
-               return ret;
-
-       ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
-                   jset->encrypted_start,
-                   vstruct_end(jset) - (void *) jset->encrypted_start);
-       if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
-               return ret;
-
-       jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
-                                 journal_nonce(jset), jset);
-
-       if (!validate_before_checksum &&
-           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
-               return ret;
-
-       unsigned sectors = vstruct_sectors(jset, c->block_bits);
-       unsigned bytes  = vstruct_bytes(jset);
-       memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
-       return 0;
-}
-
-static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       int error = bch2_journal_error(j);
-
-       /*
-        * If the journal is in an error state - we did an emergency shutdown -
-        * we prefer to continue doing journal writes. We just mark them as
-        * noflush so they'll never be used, but they'll still be visible to the
-        * list_journal tool - this helps in debugging.
-        *
-        * There's a caveat: the first journal write after marking the
-        * superblock dirty must always be a flush write, because on startup
-        * from a clean shutdown we didn't necessarily read the journal and the
-        * new journal write might overwrite whatever was in the journal
-        * previously - we can't leave the journal without any flush writes in
-        * it.
-        *
-        * So if we're in an error state, and we're still starting up, we don't
-        * write anything at all.
-        */
-       if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
-               return error;
-
-       if (error ||
-           w->noflush ||
-           (!w->must_flush &&
-            time_before(jiffies, j->last_flush_write +
-                msecs_to_jiffies(c->opts.journal_flush_delay)) &&
-            test_bit(JOURNAL_may_skip_flush, &j->flags))) {
-               w->noflush = true;
-               SET_JSET_NO_FLUSH(w->data, true);
-               w->data->last_seq       = 0;
-               w->last_seq             = 0;
-
-               j->nr_noflush_writes++;
-       } else {
-               w->must_flush = true;
-               j->last_flush_write = jiffies;
-               j->nr_flush_writes++;
-               clear_bit(JOURNAL_need_flush_write, &j->flags);
-       }
-
-       return 0;
-}
-
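-/*
- * Main journal write path: decide whether this is a flush write, prep the
- * entry, allocate space on disk (discarding old journal buckets as needed),
- * checksum, mark the replicas we're about to write to, then submit:
- */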
-CLOSURE_CALLBACK(bch2_journal_write)
-{
-       closure_type(w, struct journal_buf, io);
-       struct journal *j = container_of(w, struct journal, buf[w->idx]);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       union bch_replicas_padded replicas;
-       unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
-       int ret;
-
-       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-       BUG_ON(!w->write_started);
-       BUG_ON(w->write_allocated);
-       BUG_ON(w->write_done);
-
-       j->write_start_time = local_clock();
-
-       spin_lock(&j->lock);
-       if (nr_rw_members > 1)
-               w->separate_flush = true;
-
-       ret = bch2_journal_write_pick_flush(j, w);
-       spin_unlock(&j->lock);
-
-       if (unlikely(ret))
-               goto err;
-
-       mutex_lock(&j->buf_lock);
-       journal_buf_realloc(j, w);
-
-       ret = bch2_journal_write_prep(j, w);
-       mutex_unlock(&j->buf_lock);
-
-       if (unlikely(ret))
-               goto err;
-
-       unsigned replicas_allocated = 0;
-       while (1) {
-               ret = journal_write_alloc(j, w, &replicas_allocated);
-               if (!ret || !j->can_discard)
-                       break;
-
-               bch2_journal_do_discards(j);
-       }
-
-       if (unlikely(ret))
-               goto err_allocate_write;
-
-       ret = bch2_journal_write_checksum(j, w);
-       if (unlikely(ret))
-               goto err;
-
-       spin_lock(&j->lock);
-       /*
-        * write is allocated, no longer need to account for it in
-        * bch2_journal_space_available():
-        */
-       w->sectors = 0;
-       w->write_allocated = true;
-       j->entry_bytes_written += vstruct_bytes(w->data);
-
-       /*
-        * journal entry has been compacted and allocated, recalculate space
-        * available:
-        */
-       bch2_journal_space_available(j);
-       bch2_journal_do_writes(j);
-       spin_unlock(&j->lock);
-
-       w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
-
-       /*
-        * Mark journal replicas before we submit the write to guarantee
-        * recovery will find the journal entries after a crash.
-        */
-       bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-                                w->devs_written);
-       ret = bch2_mark_replicas(c, &replicas.e);
-       if (ret)
-               goto err;
-
-       if (c->opts.nochanges)
-               goto no_io;
-
-       if (!JSET_NO_FLUSH(w->data))
-               continue_at(cl, journal_write_preflush, j->wq);
-       else
-               continue_at(cl, journal_write_submit, j->wq);
-       return;
-err_allocate_write:
-       if (!bch2_journal_error(j)) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_journal_debug_to_text(&buf, j);
-               prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
-                                         le64_to_cpu(w->data->seq),
-                                         vstruct_sectors(w->data, c->block_bits),
-                                         bch2_err_str(ret));
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-err:
-       bch2_fatal_error(c);
-no_io:
-       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-               struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-               enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
-       }
-
-       continue_at(cl, journal_write_done, j->wq);
-}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
deleted file mode 100644 (file)
index 6fa82c4..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_IO_H
-#define _BCACHEFS_JOURNAL_IO_H
-
-#include "darray.h"
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *);
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-
-struct journal_ptr {
-       bool            csum_good;
-       struct bch_csum csum;
-       u8              dev;
-       u32             bucket;
-       u32             bucket_offset;
-       u64             sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
-       DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
-       bool                    csum_good;
-       bool                    ignore_blacklisted;
-       bool                    ignore_not_dirty;
-       /* must be last: */
-       struct jset             j;
-};
-
-static inline bool journal_replay_ignore(struct journal_replay *i)
-{
-       return !i || i->ignore_blacklisted || i->ignore_not_dirty;
-}
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
-                                       struct jset_entry *entry, unsigned type)
-{
-       while (entry < vstruct_last(jset)) {
-               if (entry->type == type)
-                       return entry;
-
-               entry = vstruct_next(entry);
-       }
-
-       return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type)                    \
-       for (struct jset_entry *entry = (jset)->start;                  \
-            (entry = __jset_entry_type_next(jset, entry, type));       \
-            entry = vstruct_next(entry))
-
-#define jset_entry_for_each_key(_e, _k)                                        \
-       for (struct bkey_i *_k = (_e)->start;                           \
-            _k < vstruct_last(_e);                                     \
-            _k = bkey_next(_k))
-
-#define for_each_jset_key(k, entry, jset)                              \
-       for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
-               jset_entry_for_each_key(entry, k)
-
-int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
-                               struct jset_entry *, unsigned, int,
-                               struct bkey_validate_context);
-void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
-                               struct jset_entry *);
-
-void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
-                              struct journal_replay *);
-
-int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
-
-CLOSURE_CALLBACK(bch2_journal_write);
-
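-/*
- * Initialize a zeroed jset_entry of @size bytes (including the header) at
- * *end, advancing *end past it:
- */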
-static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-       struct jset_entry *entry = *end;
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-       memset(entry, 0, u64s * sizeof(u64));
-       /*
-        * The u64s field counts from the start of data, ignoring the shared
-        * fields.
-        */
-       entry->u64s = cpu_to_le16(u64s - 1);
-
-       *end = vstruct_next(*end);
-       return entry;
-}
-
-#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
deleted file mode 100644 (file)
index 0042d43..0000000
+++ /dev/null
@@ -1,1037 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "trace.h"
-
-#include <linux/kthread.h>
-#include <linux/sched/mm.h>
-
-static bool __should_discard_bucket(struct journal *, struct journal_device *);
-
-/* Free space calculations: */
-
-static unsigned journal_space_from(struct journal_device *ja,
-                                  enum journal_space_from from)
-{
-       switch (from) {
-       case journal_space_discarded:
-               return ja->discard_idx;
-       case journal_space_clean_ondisk:
-               return ja->dirty_idx_ondisk;
-       case journal_space_clean:
-               return ja->dirty_idx;
-       default:
-               BUG();
-       }
-}
-
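-/*
- * Number of journal buckets available for new writes on this device: the
- * distance (mod ja->nr) from the bucket currently being written to back
- * around to the first bucket still in use, per @from:
- */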
-unsigned bch2_journal_dev_buckets_available(struct journal *j,
-                                           struct journal_device *ja,
-                                           enum journal_space_from from)
-{
-       if (!ja->nr)
-               return 0;
-
-       unsigned available = (journal_space_from(ja, from) -
-                             ja->cur_idx - 1 + ja->nr) % ja->nr;
-
-       /*
-        * Don't use the last bucket unless writing the new last_seq
-        * will make another bucket available:
-        */
-       if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
-               --available;
-
-       return available;
-}
-
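-/*
- * Raise the journal watermark to BCH_WATERMARK_reclaim when we're low on
- * clean journal space, pin fifo entries or btree write buffer space; wake
- * the journal if the watermark just dropped back down:
- */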
-void bch2_journal_set_watermark(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       bool low_on_space = j->space[journal_space_clean].total * 4 <=
-               j->space[journal_space_total].total;
-       bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
-       bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
-       unsigned watermark = low_on_space || low_on_pin || low_on_wb
-               ? BCH_WATERMARK_reclaim
-               : BCH_WATERMARK_stripe;
-
-       if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
-           track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
-           track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
-               trace_and_count(c, journal_full, c);
-
-       mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
-
-       swap(watermark, j->watermark);
-       if (watermark > j->watermark)
-               journal_wake(j);
-}
-
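-/*
- * Space available for journal writes on one device, taking into account
- * entries that are still in memory and haven't had space allocated on disk
- * yet:
- */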
-static struct journal_space
-journal_dev_space_available(struct journal *j, struct bch_dev *ca,
-                           enum journal_space_from from)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_device *ja = &ca->journal;
-       unsigned sectors, buckets, unwritten;
-       unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
-       u64 seq;
-
-       if (from == journal_space_total)
-               return (struct journal_space) {
-                       .next_entry     = bucket_size_aligned,
-                       .total          = bucket_size_aligned * ja->nr,
-               };
-
-       buckets = bch2_journal_dev_buckets_available(j, ja, from);
-       sectors = round_down(ja->sectors_free, block_sectors(c));
-
-       /*
-        * Note that we don't allocate the space for a journal entry
-        * until we write it out - thus, account for it here:
-        */
-       for (seq = journal_last_unwritten_seq(j);
-            seq <= journal_cur_seq(j);
-            seq++) {
-               unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
-
-               if (!unwritten)
-                       continue;
-
-               /* entry won't fit on this device, skip: */
-               if (unwritten > bucket_size_aligned)
-                       continue;
-
-               if (unwritten >= sectors) {
-                       if (!buckets) {
-                               sectors = 0;
-                               break;
-                       }
-
-                       buckets--;
-                       sectors = bucket_size_aligned;
-               }
-
-               sectors -= unwritten;
-       }
-
-       if (sectors < ca->mi.bucket_size && buckets) {
-               buckets--;
-               sectors = bucket_size_aligned;
-       }
-
-       return (struct journal_space) {
-               .next_entry     = sectors,
-               .total          = sectors + buckets * bucket_size_aligned,
-       };
-}
-
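-/*
- * Space available for a journal entry that has to be written to
- * @nr_devs_want devices, based on the devices with the most free space:
- */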
-static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
-                           enum journal_space_from from)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       unsigned pos, nr_devs = 0;
-       struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
-       unsigned min_bucket_size = U32_MAX;
-
-       BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
-
-       for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
-               if (!ca->journal.nr ||
-                   !ca->mi.durability)
-                       continue;
-
-               min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
-
-               space = journal_dev_space_available(j, ca, from);
-               if (!space.next_entry)
-                       continue;
-
-               for (pos = 0; pos < nr_devs; pos++)
-                       if (space.total > dev_space[pos].total)
-                               break;
-
-               array_insert_item(dev_space, nr_devs, pos, space);
-       }
-
-       if (nr_devs < nr_devs_want)
-               return (struct journal_space) { 0, 0 };
-
-       /*
-        * It's possible for bucket size to be misaligned w.r.t. the filesystem
-        * block size:
-        */
-       min_bucket_size = round_down(min_bucket_size, block_sectors(c));
-
-       /*
-        * We sorted largest to smallest, and we want the smallest out of the
-        * @nr_devs_want largest devices:
-        */
-       space = dev_space[nr_devs_want - 1];
-       space.next_entry = min(space.next_entry, min_bucket_size);
-       return space;
-}
-
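-/*
- * Recalculate j->space[] and the number of sectors the next journal entry
- * may use; called with j->lock held whenever the amount of free journal
- * space may have changed:
- */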
-void bch2_journal_space_available(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       unsigned clean, clean_ondisk, total;
-       unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
-                                      j->buf[1].buf_size >> 9);
-       unsigned nr_online = 0, nr_devs_want;
-       bool can_discard = false;
-       int ret = 0;
-
-       lockdep_assert_held(&j->lock);
-       guard(rcu)();
-
-       for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
-               struct journal_device *ja = &ca->journal;
-
-               if (!ja->nr)
-                       continue;
-
-               while (ja->dirty_idx != ja->cur_idx &&
-                      ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
-                       ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-
-               while (ja->dirty_idx_ondisk != ja->dirty_idx &&
-                      ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
-                       ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-
-               can_discard |= __should_discard_bucket(j, ja);
-
-               max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
-               nr_online++;
-       }
-
-       j->can_discard = can_discard;
-
-       if (nr_online < metadata_replicas_required(c)) {
-               if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
-                       struct printbuf buf = PRINTBUF;
-                       buf.atomic++;
-                       prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
-                                  "rw journal devs:", nr_online, metadata_replicas_required(c));
-
-                       for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
-                               prt_printf(&buf, " %s", ca->name);
-
-                       bch_err(c, "%s", buf.buf);
-                       printbuf_exit(&buf);
-               }
-               ret = bch_err_throw(c, insufficient_journal_devices);
-               goto out;
-       }
-
-       nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
-
-       for (unsigned i = 0; i < journal_space_nr; i++)
-               j->space[i] = __journal_space_available(j, nr_devs_want, i);
-
-       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
-       clean           = j->space[journal_space_clean].total;
-       total           = j->space[journal_space_total].total;
-
-       if (!j->space[journal_space_discarded].next_entry)
-               ret = bch_err_throw(c, journal_full);
-
-       if ((j->space[journal_space_clean_ondisk].next_entry <
-            j->space[journal_space_clean_ondisk].total) &&
-           (clean - clean_ondisk <= total / 8) &&
-           (clean_ondisk * 2 > clean))
-               set_bit(JOURNAL_may_skip_flush, &j->flags);
-       else
-               clear_bit(JOURNAL_may_skip_flush, &j->flags);
-
-       bch2_journal_set_watermark(j);
-out:
-       j->cur_entry_sectors    = !ret
-               ? j->space[journal_space_discarded].next_entry
-               : 0;
-       j->cur_entry_error      = ret;
-
-       if (!ret)
-               journal_wake(j);
-}
-
-/* Discards - last part of journal reclaim: */
-
-static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
-       unsigned min_free = max(4, ja->nr / 8);
-
-       return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
-               min_free &&
-               ja->discard_idx != ja->dirty_idx_ondisk;
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
-       spin_lock(&j->lock);
-       bool ret = __should_discard_bucket(j, ja);
-       spin_unlock(&j->lock);
-
-       return ret;
-}
-
-/*
- * Advance ja->discard_idx as long as it points to buckets that are no longer
- * dirty, issuing discards if necessary:
- */
-void bch2_journal_do_discards(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
-       mutex_lock(&j->discard_lock);
-
-       for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
-               struct journal_device *ja = &ca->journal;
-
-               while (should_discard_bucket(j, ja)) {
-                       if (!c->opts.nochanges &&
-                           bch2_discard_opt_enabled(c, ca) &&
-                           bdev_max_discard_sectors(ca->disk_sb.bdev))
-                               blkdev_issue_discard(ca->disk_sb.bdev,
-                                       bucket_to_sector(ca,
-                                               ja->buckets[ja->discard_idx]),
-                                       ca->mi.bucket_size, GFP_NOFS);
-
-                       spin_lock(&j->lock);
-                       ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-
-                       bch2_journal_space_available(j);
-                       spin_unlock(&j->lock);
-               }
-       }
-
-       mutex_unlock(&j->discard_lock);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, holding it open to ensure it gets replayed during recovery:
- */
-
-void bch2_journal_reclaim_fast(struct journal *j)
-{
-       bool popped = false;
-
-       lockdep_assert_held(&j->lock);
-
-       /*
-        * Unpin journal entries whose reference counts reached zero, meaning
-        * all btree nodes got written out
-        */
-       while (!fifo_empty(&j->pin) &&
-              j->pin.front <= j->seq_ondisk &&
-              !atomic_read(&fifo_peek_front(&j->pin).count)) {
-               j->pin.front++;
-               popped = true;
-       }
-
-       if (popped) {
-               bch2_journal_space_available(j);
-               __closure_wake_up(&j->reclaim_flush_wait);
-       }
-}
-
-bool __bch2_journal_pin_put(struct journal *j, u64 seq)
-{
-       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-       return atomic_dec_and_test(&pin_list->count);
-}
-
-void bch2_journal_pin_put(struct journal *j, u64 seq)
-{
-       if (__bch2_journal_pin_put(j, seq)) {
-               spin_lock(&j->lock);
-               bch2_journal_reclaim_fast(j);
-               spin_unlock(&j->lock);
-       }
-}
-
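-/*
- * Returns true if we just dropped the last reference on the entry at the
- * front of the pin fifo, i.e. the caller needs to run
- * bch2_journal_reclaim_fast():
- */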
-static inline bool __journal_pin_drop(struct journal *j,
-                                     struct journal_entry_pin *pin)
-{
-       struct journal_entry_pin_list *pin_list;
-
-       if (!journal_pin_active(pin))
-               return false;
-
-       if (j->flush_in_progress == pin)
-               j->flush_in_progress_dropped = true;
-
-       pin_list = journal_seq_pin(j, pin->seq);
-       pin->seq = 0;
-       list_del_init(&pin->list);
-
-       if (j->reclaim_flush_wait.list.first)
-               __closure_wake_up(&j->reclaim_flush_wait);
-
-       /*
-        * Unpinning a journal entry may make journal_next_bucket() succeed, if
-        * writing a new last_seq will now make another bucket available:
-        */
-       return atomic_dec_and_test(&pin_list->count) &&
-               pin_list == &fifo_peek_front(&j->pin);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
-                          struct journal_entry_pin *pin)
-{
-       spin_lock(&j->lock);
-       if (__journal_pin_drop(j, pin))
-               bch2_journal_reclaim_fast(j);
-       spin_unlock(&j->lock);
-}
-
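-/*
- * Classify a pin by its flush callback; btree node pins are bucketed by the
- * node's level so that reclaim can flush leaves before interior nodes:
- */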
-static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
-                                             journal_pin_flush_fn fn)
-{
-       if (fn == bch2_btree_node_flush0 ||
-           fn == bch2_btree_node_flush1) {
-               unsigned idx = fn == bch2_btree_node_flush1;
-               struct btree *b = container_of(pin, struct btree, writes[idx].journal);
-
-               return JOURNAL_PIN_TYPE_btree0 - b->c.level;
-       } else if (fn == bch2_btree_key_cache_journal_flush)
-               return JOURNAL_PIN_TYPE_key_cache;
-       else
-               return JOURNAL_PIN_TYPE_other;
-}
-
-static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
-                         struct journal_entry_pin *pin,
-                         journal_pin_flush_fn flush_fn,
-                         enum journal_pin_type type)
-{
-       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-       /*
-        * flush_fn is how we identify journal pins in debugfs, so must always
-        * exist, even if it doesn't do anything:
-        */
-       BUG_ON(!flush_fn);
-
-       atomic_inc(&pin_list->count);
-       pin->seq        = seq;
-       pin->flush      = flush_fn;
-
-       if (list_empty(&pin_list->unflushed[type]) &&
-           j->reclaim_flush_wait.list.first)
-               __closure_wake_up(&j->reclaim_flush_wait);
-
-       list_add(&pin->list, &pin_list->unflushed[type]);
-}
-
-void bch2_journal_pin_copy(struct journal *j,
-                          struct journal_entry_pin *dst,
-                          struct journal_entry_pin *src,
-                          journal_pin_flush_fn flush_fn)
-{
-       spin_lock(&j->lock);
-
-       u64 seq = READ_ONCE(src->seq);
-
-       if (seq < journal_last_seq(j)) {
-               /*
-                * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
-                * the src pin - with the pin dropped, the entry to pin might no
-                * longer exist, but that means there's no longer anything to
-                * copy and we can bail out here:
-                */
-               spin_unlock(&j->lock);
-               return;
-       }
-
-       bool reclaim = __journal_pin_drop(j, dst);
-
-       bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
-
-       if (reclaim)
-               bch2_journal_reclaim_fast(j);
-
-       /*
-        * If the journal is currently full, we might want to call flush_fn
-        * immediately:
-        */
-       if (seq == journal_last_seq(j))
-               journal_wake(j);
-       spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_set(struct journal *j, u64 seq,
-                         struct journal_entry_pin *pin,
-                         journal_pin_flush_fn flush_fn)
-{
-       spin_lock(&j->lock);
-
-       BUG_ON(seq < journal_last_seq(j));
-
-       bool reclaim = __journal_pin_drop(j, pin);
-
-       bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
-
-       if (reclaim)
-               bch2_journal_reclaim_fast(j);
-       /*
-        * If the journal is currently full, we might want to call flush_fn
-        * immediately:
-        */
-       if (seq == journal_last_seq(j))
-               journal_wake(j);
-
-       spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_pin_flush: ensure journal pin callback is no longer running
- * @j:         journal object
- * @pin:       pin to flush
- */
-void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
-{
-       BUG_ON(journal_pin_active(pin));
-
-       wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
-}
-
-/*
- * Journal reclaim: flush references to open journal entries to reclaim space in
- * the journal
- *
- * May be done by the journal code in the background as needed to free up space
- * for more journal entries, or as part of doing a clean shutdown, or to migrate
- * data off of a specific device:
- */
-
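-/*
- * Find the next pin to flush, oldest first: pins with a type in
- * @allowed_below_seq qualify at sequence numbers up to @seq_to_flush, pins
- * with a type in @allowed_above_seq qualify at any sequence number; *seq is
- * set to the pin's sequence number:
- */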
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j,
-                    u64 seq_to_flush,
-                    unsigned allowed_below_seq,
-                    unsigned allowed_above_seq,
-                    u64 *seq)
-{
-       struct journal_entry_pin_list *pin_list;
-       struct journal_entry_pin *ret = NULL;
-
-       fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
-               if (*seq > seq_to_flush && !allowed_above_seq)
-                       break;
-
-               for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
-                       if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
-                           (BIT(i) & allowed_above_seq)) {
-                               ret = list_first_entry_or_null(&pin_list->unflushed[i],
-                                       struct journal_entry_pin, list);
-                               if (ret)
-                                       return ret;
-                       }
-       }
-
-       return NULL;
-}
-
-/* Returns the number of pins we flushed: */
-static size_t journal_flush_pins(struct journal *j,
-                                u64 seq_to_flush,
-                                unsigned allowed_below_seq,
-                                unsigned allowed_above_seq,
-                                unsigned min_any,
-                                unsigned min_key_cache)
-{
-       struct journal_entry_pin *pin;
-       size_t nr_flushed = 0;
-       journal_pin_flush_fn flush_fn;
-       u64 seq;
-       int err;
-
-       lockdep_assert_held(&j->reclaim_lock);
-
-       while (1) {
-               unsigned allowed_above = allowed_above_seq;
-               unsigned allowed_below = allowed_below_seq;
-
-               if (min_any) {
-                       allowed_above |= ~0;
-                       allowed_below |= ~0;
-               }
-
-               if (min_key_cache) {
-                       allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
-                       allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
-               }
-
-               cond_resched();
-
-               j->last_flushed = jiffies;
-
-               spin_lock(&j->lock);
-               pin = journal_get_next_pin(j, seq_to_flush,
-                                          allowed_below,
-                                          allowed_above, &seq);
-               if (pin) {
-                       BUG_ON(j->flush_in_progress);
-                       j->flush_in_progress = pin;
-                       j->flush_in_progress_dropped = false;
-                       flush_fn = pin->flush;
-               }
-               spin_unlock(&j->lock);
-
-               if (!pin)
-                       break;
-
-               if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
-                       min_key_cache--;
-
-               if (min_any)
-                       min_any--;
-
-               err = flush_fn(j, pin, seq);
-
-               spin_lock(&j->lock);
-               /* Pin might have been dropped or rearmed: */
-               if (likely(!err && !j->flush_in_progress_dropped))
-                       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
-               j->flush_in_progress = NULL;
-               j->flush_in_progress_dropped = false;
-               spin_unlock(&j->lock);
-
-               wake_up(&j->pin_flush_wait);
-
-               if (err)
-                       break;
-
-               nr_flushed++;
-       }
-
-       return nr_flushed;
-}
-
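-/*
- * The sequence number background reclaim should flush up to: high enough to
- * keep each device's journal at most half full, and the pin fifo at most
- * half full:
- */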
-static u64 journal_seq_to_flush(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       u64 seq_to_flush = 0;
-
-       guard(spinlock)(&j->lock);
-       guard(rcu)();
-
-       for_each_rw_member_rcu(c, ca) {
-               struct journal_device *ja = &ca->journal;
-               unsigned nr_buckets, bucket_to_flush;
-
-               if (!ja->nr)
-                       continue;
-
-               /* Try to keep the journal at most half full: */
-               nr_buckets = ja->nr / 2;
-
-               bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
-               seq_to_flush = max(seq_to_flush,
-                                  ja->bucket_seq[bucket_to_flush]);
-       }
-
-       /* Also flush if the pin fifo is more than half full */
-       return max_t(s64, seq_to_flush,
-                    (s64) journal_cur_seq(j) -
-                    (j->pin.size >> 1));
-}
-
-/**
- * __bch2_journal_reclaim - free up journal buckets
- * @j:         journal object
- * @direct:    direct or background reclaim?
- * @kicked:    requested to run since we last ran?
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct btree_cache *bc = &c->btree_cache;
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
-       u64 seq_to_flush;
-       size_t min_nr, min_key_cache, nr_flushed;
-       unsigned flags;
-       int ret = 0;
-
-       /*
-        * We can't invoke memory reclaim while holding the reclaim_lock -
-        * journal reclaim is required to make progress for memory reclaim
-        * (cleaning the caches), so we can't get stuck in memory reclaim while
-        * we're holding the reclaim lock:
-        */
-       lockdep_assert_held(&j->reclaim_lock);
-       flags = memalloc_noreclaim_save();
-
-       do {
-               if (kthread && kthread_should_stop())
-                       break;
-
-               ret = bch2_journal_error(j);
-               if (ret)
-                       break;
-
-               /* XXX shove journal discards off to another thread */
-               bch2_journal_do_discards(j);
-
-               seq_to_flush = journal_seq_to_flush(j);
-               min_nr = 0;
-
-               /*
-                * If it's been longer than journal_reclaim_delay since we last flushed,
-                * make sure to flush at least one journal pin:
-                */
-               if (time_after(jiffies, j->last_flushed +
-                              msecs_to_jiffies(c->opts.journal_reclaim_delay)))
-                       min_nr = 1;
-
-               if (j->watermark != BCH_WATERMARK_stripe)
-                       min_nr = 1;
-
-               size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
-               if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
-                       min_nr = 1;
-
-               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
-               trace_and_count(c, journal_reclaim_start, c,
-                               direct, kicked,
-                               min_nr, min_key_cache,
-                               atomic_long_read(&bc->nr_dirty), btree_cache_live,
-                               atomic_long_read(&c->btree_key_cache.nr_dirty),
-                               atomic_long_read(&c->btree_key_cache.nr_keys));
-
-               nr_flushed = journal_flush_pins(j, seq_to_flush,
-                                               ~0, 0,
-                                               min_nr, min_key_cache);
-
-               if (direct)
-                       j->nr_direct_reclaim += nr_flushed;
-               else
-                       j->nr_background_reclaim += nr_flushed;
-               trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
-
-               if (nr_flushed)
-                       wake_up(&j->reclaim_wait);
-       } while ((min_nr || min_key_cache) && nr_flushed && !direct);
-
-       memalloc_noreclaim_restore(flags);
-
-       return ret;
-}
-
-int bch2_journal_reclaim(struct journal *j)
-{
-       return __bch2_journal_reclaim(j, true, true);
-}
-
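-/*
- * Background reclaim thread: run reclaim, then sleep until we're kicked or
- * the reclaim delay has elapsed - or indefinitely, if the journal is empty:
- */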
-static int bch2_journal_reclaim_thread(void *arg)
-{
-       struct journal *j = arg;
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       unsigned long delay, now;
-       bool journal_empty;
-       int ret = 0;
-
-       set_freezable();
-
-       j->last_flushed = jiffies;
-
-       while (!ret && !kthread_should_stop()) {
-               bool kicked = j->reclaim_kicked;
-
-               j->reclaim_kicked = false;
-
-               mutex_lock(&j->reclaim_lock);
-               ret = __bch2_journal_reclaim(j, false, kicked);
-               mutex_unlock(&j->reclaim_lock);
-
-               now = jiffies;
-               delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
-               j->next_reclaim = j->last_flushed + delay;
-
-               if (!time_in_range(j->next_reclaim, now, now + delay))
-                       j->next_reclaim = now + delay;
-
-               while (1) {
-                       set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
-                       if (kthread_should_stop())
-                               break;
-                       if (j->reclaim_kicked)
-                               break;
-
-                       spin_lock(&j->lock);
-                       journal_empty = fifo_empty(&j->pin);
-                       spin_unlock(&j->lock);
-
-                       long timeout = j->next_reclaim - jiffies;
-
-                       if (journal_empty)
-                               schedule();
-                       else if (timeout > 0)
-                               schedule_timeout(timeout);
-                       else
-                               break;
-               }
-               __set_current_state(TASK_RUNNING);
-       }
-
-       return 0;
-}
-
-void bch2_journal_reclaim_stop(struct journal *j)
-{
-       struct task_struct *p = j->reclaim_thread;
-
-       j->reclaim_thread = NULL;
-
-       if (p) {
-               kthread_stop(p);
-               put_task_struct(p);
-       }
-}
-
-int bch2_journal_reclaim_start(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct task_struct *p;
-       int ret;
-
-       if (j->reclaim_thread)
-               return 0;
-
-       p = kthread_create(bch2_journal_reclaim_thread, j,
-                          "bch-reclaim/%s", c->name);
-       ret = PTR_ERR_OR_ZERO(p);
-       bch_err_msg(c, ret, "creating journal reclaim thread");
-       if (ret)
-               return ret;
-
-       get_task_struct(p);
-       j->reclaim_thread = p;
-       wake_up_process(p);
-       return 0;
-}
-
-static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
-                                       unsigned types)
-{
-       struct journal_entry_pin_list *pin_list;
-       u64 seq;
-
-       spin_lock(&j->lock);
-       fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
-               if (seq > seq_to_flush)
-                       break;
-
-               for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
-                       if ((BIT(i) & types) &&
-                           (!list_empty(&pin_list->unflushed[i]) ||
-                            !list_empty(&pin_list->flushed[i]))) {
-                               spin_unlock(&j->lock);
-                               return true;
-                       }
-       }
-       spin_unlock(&j->lock);
-
-       return false;
-}
-
-static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
-                                                unsigned types)
-{
-       return  journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
-               journal_pins_still_flushing(j, seq_to_flush, types);
-}
-
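-/*
- * Wait condition for bch2_journal_flush_pins(): flush one pin type at a
- * time, and return nonzero once there's nothing left to flush up to
- * @seq_to_flush:
- */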
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
-                             bool *did_work)
-{
-       int ret = 0;
-
-       ret = bch2_journal_error(j);
-       if (ret)
-               return ret;
-
-       mutex_lock(&j->reclaim_lock);
-
-       for (int type = JOURNAL_PIN_TYPE_NR - 1;
-            type >= 0;
-            --type)
-               if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
-                       *did_work = true;
-                       goto unlock;
-               }
-
-       if (seq_to_flush > journal_cur_seq(j))
-               bch2_journal_entry_close(j);
-
-       spin_lock(&j->lock);
-       /*
-        * If journal replay hasn't completed, the unreplayed journal entries
-        * hold refs on their corresponding sequence numbers
-        */
-       ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
-               journal_last_seq(j) > seq_to_flush ||
-               !fifo_used(&j->pin);
-
-       spin_unlock(&j->lock);
-unlock:
-       mutex_unlock(&j->reclaim_lock);
-
-       return ret;
-}
-
-bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
-       /* time_stats this */
-       bool did_work = false;
-
-       if (!test_bit(JOURNAL_running, &j->flags))
-               return false;
-
-       closure_wait_event(&j->reclaim_flush_wait,
-               journal_flush_done(j, seq_to_flush, &did_work));
-
-       return did_work;
-}
-
-int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_entry_pin_list *p;
-       u64 iter, seq = 0;
-       int ret = 0;
-
-       spin_lock(&j->lock);
-       fifo_for_each_entry_ptr(p, &j->pin, iter)
-               if (dev_idx >= 0
-                   ? bch2_dev_list_has_dev(p->devs, dev_idx)
-                   : p->devs.nr < c->opts.metadata_replicas)
-                       seq = iter;
-       spin_unlock(&j->lock);
-
-       bch2_journal_flush_pins(j, seq);
-
-       ret = bch2_journal_error(j);
-       if (ret)
-               return ret;
-
-       mutex_lock(&c->replicas_gc_lock);
-       bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
-
-       /*
-        * Now that we've populated replicas_gc, write to the journal to mark
-        * active journal devices. This handles the case where the journal might
-        * be empty. Otherwise we could clear all journal replicas and
-        * temporarily put the fs into an unrecoverable state. Journal recovery
-        * expects to find devices marked for journal data on unclean mount.
-        */
-       ret = bch2_journal_meta(&c->journal);
-       if (ret)
-               goto err;
-
-       seq = 0;
-       spin_lock(&j->lock);
-       while (!ret) {
-               union bch_replicas_padded replicas;
-
-               seq = max(seq, journal_last_seq(j));
-               if (seq >= j->pin.back)
-                       break;
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
-                                        journal_seq_pin(j, seq)->devs);
-               seq++;
-
-               if (replicas.e.nr_devs) {
-                       spin_unlock(&j->lock);
-                       ret = bch2_mark_replicas(c, &replicas.e);
-                       spin_lock(&j->lock);
-               }
-       }
-       spin_unlock(&j->lock);
-err:
-       ret = bch2_replicas_gc_end(c, ret);
-       mutex_unlock(&c->replicas_gc_lock);
-
-       return ret;
-}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
-       struct journal_entry_pin_list *pin_list;
-       struct journal_entry_pin *pin;
-
-       spin_lock(&j->lock);
-       if (!test_bit(JOURNAL_running, &j->flags)) {
-               spin_unlock(&j->lock);
-               return true;
-       }
-
-       *seq = max(*seq, j->pin.front);
-
-       if (*seq >= j->pin.back) {
-               spin_unlock(&j->lock);
-               return true;
-       }
-
-       out->atomic++;
-
-       pin_list = journal_seq_pin(j, *seq);
-
-       prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "unflushed:\n");
-       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
-               list_for_each_entry(pin, &pin_list->unflushed[i], list)
-                       prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
-       prt_printf(out, "flushed:\n");
-       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
-               list_for_each_entry(pin, &pin_list->flushed[i], list)
-                       prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
-       printbuf_indent_sub(out, 2);
-
-       --out->atomic;
-       spin_unlock(&j->lock);
-
-       return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
-       u64 seq = 0;
-
-       while (!bch2_journal_seq_pins_to_text(out, j, &seq))
-               seq++;
-}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
deleted file mode 100644 (file)
index 0a73d71..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
-#define _BCACHEFS_JOURNAL_RECLAIM_H
-
-#define JOURNAL_PIN    (32 * 1024)
-
-static inline void journal_reclaim_kick(struct journal *j)
-{
-       struct task_struct *p = READ_ONCE(j->reclaim_thread);
-
-       j->reclaim_kicked = true;
-       if (p)
-               wake_up_process(p);
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *,
-                                           struct journal_device *,
-                                           enum journal_space_from);
-void bch2_journal_set_watermark(struct journal *);
-void bch2_journal_space_available(struct journal *);
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
-       return pin->seq != 0;
-}
-
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
-{
-       EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
-
-       return &j->pin.data[seq & j->pin.mask];
-}
-
-void bch2_journal_reclaim_fast(struct journal *);
-bool __bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
-                         journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
-                                       struct journal_entry_pin *pin,
-                                       journal_pin_flush_fn flush_fn)
-{
-       if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
-               bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_copy(struct journal *,
-                          struct journal_entry_pin *,
-                          struct journal_entry_pin *,
-                          journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
-                                          struct journal_entry_pin *pin,
-                                          journal_pin_flush_fn flush_fn)
-{
-       if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
-               bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_do_discards(struct journal *);
-int bch2_journal_reclaim(struct journal *);
-
-void bch2_journal_reclaim_stop(struct journal *);
-int bch2_journal_reclaim_start(struct journal *);
-
-bool bch2_journal_flush_pins(struct journal *, u64);
-
-static inline bool bch2_journal_flush_all_pins(struct journal *j)
-{
-       return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-int bch2_journal_flush_device_pins(struct journal *, int);
-
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
-
-#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
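The pin API above is used by any subsystem that holds dirty state tied to a journal sequence number: take a pin when the state becomes dirty, drop it once the state is written back, and supply a flush callback so journal reclaim can force writeback when it needs that sequence number to become reclaimable. A minimal sketch of that lifecycle - the embedding struct and callback body here are illustrative, not from any real caller:

    /* Hypothetical object whose dirty state keeps a journal entry pinned: */
    struct my_dirty_obj {
            struct journal_entry_pin        pin;
            /* ... dirty state ... */
    };

    /* Called by journal reclaim when it wants @seq to become reclaimable: */
    static int my_obj_flush(struct journal *j,
                            struct journal_entry_pin *pin, u64 seq)
    {
            struct my_dirty_obj *obj = container_of(pin, struct my_dirty_obj, pin);

            /* ... write out obj's dirty state ... */

            bch2_journal_pin_drop(j, &obj->pin);
            return 0;
    }

    static void my_obj_mark_dirty(struct journal *j, struct my_dirty_obj *obj,
                                  u64 seq)
    {
            /* No-op if we already pin an entry <= seq: */
            bch2_journal_pin_add(j, seq, &obj->pin, my_obj_flush);
    }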
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
deleted file mode 100644 (file)
index 0cb9b93..0000000
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "journal_sb.h"
-#include "darray.h"
-
-#include <linux/sort.h>
-
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
-       const u64 *l = _l;
-       const u64 *r = _r;
-
-       return cmp_int(*l, *r);
-}
-
-static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-       int ret = -BCH_ERR_invalid_sb_journal;
-       unsigned nr;
-       unsigned i;
-       u64 *b;
-
-       nr = bch2_nr_journal_buckets(journal);
-       if (!nr)
-               return 0;
-
-       b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
-       if (!b)
-               return -BCH_ERR_ENOMEM_sb_journal_validate;
-
-       for (i = 0; i < nr; i++)
-               b[i] = le64_to_cpu(journal->buckets[i]);
-
-       sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
-       if (!b[0]) {
-               prt_printf(err, "journal bucket at sector 0");
-               goto err;
-       }
-
-       if (b[0] < le16_to_cpu(m.first_bucket)) {
-               prt_printf(err, "journal bucket %llu before first bucket %u",
-                      b[0], le16_to_cpu(m.first_bucket));
-               goto err;
-       }
-
-       if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
-               prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1], le64_to_cpu(m.nbuckets));
-               goto err;
-       }
-
-       for (i = 0; i + 1 < nr; i++)
-               if (b[i] == b[i + 1]) {
-                       prt_printf(err, "duplicate journal buckets %llu", b[i]);
-                       goto err;
-               }
-
-       ret = 0;
-err:
-       kfree(b);
-       return ret;
-}
-
-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
-                                   struct bch_sb_field *f)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       unsigned i, nr = bch2_nr_journal_buckets(journal);
-
-       prt_printf(out, "Buckets: ");
-       for (i = 0; i < nr; i++)
-               prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
-       prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal = {
-       .validate       = bch2_sb_journal_validate,
-       .to_text        = bch2_sb_journal_to_text,
-};
-
-struct u64_range {
-       u64     start;
-       u64     end;
-};
-
-static int u64_range_cmp(const void *_l, const void *_r)
-{
-       const struct u64_range *l = _l;
-       const struct u64_range *r = _r;
-
-       return cmp_int(l->start, r->start);
-}
-
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
-       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-       int ret = -BCH_ERR_invalid_sb_journal;
-       u64 sum = 0;
-       unsigned nr;
-       unsigned i;
-       struct u64_range *b;
-
-       nr = bch2_sb_field_journal_v2_nr_entries(journal);
-       if (!nr)
-               return 0;
-
-       b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
-       if (!b)
-               return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
-
-       for (i = 0; i < nr; i++) {
-               b[i].start = le64_to_cpu(journal->d[i].start);
-               b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
-
-               if (b[i].end <= b[i].start) {
-                       prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
-                                  le64_to_cpu(journal->d[i].start),
-                                  le64_to_cpu(journal->d[i].nr));
-                       goto err;
-               }
-
-               sum += le64_to_cpu(journal->d[i].nr);
-       }
-
-       sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
-
-       if (!b[0].start) {
-               prt_printf(err, "journal bucket at sector 0");
-               goto err;
-       }
-
-       if (b[0].start < le16_to_cpu(m.first_bucket)) {
-               prt_printf(err, "journal bucket %llu before first bucket %u",
-                      b[0].start, le16_to_cpu(m.first_bucket));
-               goto err;
-       }
-
-       if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
-               prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
-               goto err;
-       }
-
-       for (i = 0; i + 1 < nr; i++) {
-               if (b[i].end > b[i + 1].start) {
-                       prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
-                              b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
-                       goto err;
-               }
-       }
-
-       if (sum > UINT_MAX) {
-               prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
-               goto err;
-       }
-
-       ret = 0;
-err:
-       kfree(b);
-       return ret;
-}
-
-static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
-                                   struct bch_sb_field *f)
-{
-       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
-       unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
-
-       prt_printf(out, "Buckets: ");
-       for (i = 0; i < nr; i++)
-               prt_printf(out, " %llu-%llu",
-                      le64_to_cpu(journal->d[i].start),
-                      le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
-       prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
-       .validate       = bch2_sb_journal_v2_validate,
-       .to_text        = bch2_sb_journal_v2_to_text,
-};
-
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
-                              u64 *buckets, unsigned nr)
-{
-       struct bch_sb_field_journal_v2 *j;
-       unsigned i, dst = 0, nr_compacted = 1;
-
-       if (c)
-               lockdep_assert_held(&c->sb_lock);
-
-       if (!nr) {
-               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
-               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
-               return 0;
-       }
-
-       for (i = 0; i + 1 < nr; i++)
-               if (buckets[i] + 1 != buckets[i + 1])
-                       nr_compacted++;
-
-       j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
-                        (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
-       if (!j)
-               return bch_err_throw(c, ENOSPC_sb_journal);
-
-       bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
-
-       j->d[dst].start = cpu_to_le64(buckets[0]);
-       j->d[dst].nr    = cpu_to_le64(1);
-
-       for (i = 1; i < nr; i++) {
-               if (buckets[i] == buckets[i - 1] + 1) {
-                       le64_add_cpu(&j->d[dst].nr, 1);
-               } else {
-                       dst++;
-                       j->d[dst].start = cpu_to_le64(buckets[i]);
-                       j->d[dst].nr    = cpu_to_le64(1);
-               }
-       }
-
-       BUG_ON(dst + 1 != nr_compacted);
-       return 0;
-}
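bch2_journal_buckets_to_sb() above stores the bucket list run-length encoded: the first pass counts the runs of consecutive bucket numbers (nr_compacted), the second emits one {start, nr} entry per run, so buckets {10, 11, 12, 40, 41} become the two entries {10, 3} and {40, 2} (numbers illustrative). A standalone sketch of the same compaction, minus the superblock plumbing, assuming a sorted, non-empty input:

    struct bucket_range { u64 start, nr; };

    /* Returns the number of ranges written to @out; @out must have room for
     * @nr entries (worst case: no consecutive buckets at all): */
    static unsigned compact_buckets(const u64 *buckets, unsigned nr,
                                    struct bucket_range *out)
    {
            unsigned dst = 0;

            out[0] = (struct bucket_range) { .start = buckets[0], .nr = 1 };

            for (unsigned i = 1; i < nr; i++) {
                    if (buckets[i] == buckets[i - 1] + 1)
                            out[dst].nr++;
                    else
                            out[++dst] = (struct bucket_range) {
                                    .start  = buckets[i],
                                    .nr     = 1,
                            };
            }

            return dst + 1;
    }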
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
deleted file mode 100644 (file)
index ba40a7e..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include "super-io.h"
-#include "vstructs.h"
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
-       return j
-               ? (__le64 *) vstruct_end(&j->field) - j->buckets
-               : 0;
-}
-
-static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
-{
-       if (!j)
-               return 0;
-
-       return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
-
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
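Both helpers above derive the entry count purely from the field's size: vstruct_end() points just past the last valid byte, and subtracting the flexible array's base pointer, with the right element type, gives the number of elements. The same idiom in plain C (a generic illustration, not bcachefs code):

    struct blob {
            size_t  bytes;          /* size of data[], in bytes */
            u64     data[];
    };

    static size_t blob_nr_entries(const struct blob *b)
    {
            const u64 *end = (const void *) b->data + b->bytes;

            return end - b->data;   /* pointer subtraction counts elements */
    }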
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
deleted file mode 100644 (file)
index af4fe41..0000000
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "eytzinger.h"
-#include "journal.h"
-#include "journal_seq_blacklist.h"
-#include "super-io.h"
-
-/*
- * journal_seq_blacklist machinery:
- *
- * To guarantee order of btree updates after a crash, we need to detect when a
- * btree node entry (bset) is newer than the newest journal entry that was
- * successfully written, and ignore it - effectively ignoring any btree updates
- * that didn't make it into the journal.
- *
- * If we didn't do this, we might have two btree nodes, a and b, both with
- * updates that weren't written to the journal yet: if b was updated after a,
- * but b was flushed and not a - oops; on recovery we'll find that the updates
- * to b happened, but not the updates to a that happened before it.
- *
- * Ignoring bsets that are newer than the newest journal entry is always safe,
- * because everything they contain will also have been journalled - and must
- * still be present in the journal on disk until a journal entry has been
- * written _after_ that bset was written.
- *
- * To accomplish this, bsets record the newest journal sequence number they
- * contain updates for; then, on startup, the btree code queries the journal
- * code to ask "Is this sequence number newer than the newest journal entry? If
- * so, ignore it."
- *
- * When this happens, we must blacklist that journal sequence number: the
- * journal must not write any entries with that sequence number, and it must
- * record that it was blacklisted so that a) on recovery we don't think we
- * have missing journal entries and b) the btree code continues to ignore
- * that bset until that btree node is rewritten.
- */
-
-static unsigned sb_blacklist_u64s(unsigned nr)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl;
-
-       return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
-}
-
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl;
-       unsigned i = 0, nr;
-       int ret = 0;
-
-       mutex_lock(&c->sb_lock);
-       bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
-       nr = blacklist_nr_entries(bl);
-
-       while (i < nr) {
-               struct journal_seq_blacklist_entry *e =
-                       bl->start + i;
-
-               if (end < le64_to_cpu(e->start))
-                       break;
-
-               if (start > le64_to_cpu(e->end)) {
-                       i++;
-                       continue;
-               }
-
-               /*
-                * Entry is contiguous or overlapping with new entry: merge it
-                * with new entry, and delete:
-                */
-
-               start   = min(start,    le64_to_cpu(e->start));
-               end     = max(end,      le64_to_cpu(e->end));
-               array_remove_item(bl->start, nr, i);
-       }
-
-       bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
-                                 sb_blacklist_u64s(nr + 1));
-       if (!bl) {
-               ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
-               goto out;
-       }
-
-       array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
-               .start  = cpu_to_le64(start),
-               .end    = cpu_to_le64(end),
-       }));
-       c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
-
-       ret = bch2_write_super(c);
-out:
-       mutex_unlock(&c->sb_lock);
-
-       return ret ?: bch2_blacklist_table_initialize(c);
-}
-
-static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
-{
-       const struct journal_seq_blacklist_table_entry *l = _l;
-       const struct journal_seq_blacklist_table_entry *r = _r;
-
-       return cmp_int(l->start, r->start);
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
-                                    bool dirty)
-{
-       struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
-       struct journal_seq_blacklist_table_entry search = { .start = seq };
-       int idx;
-
-       if (!t)
-               return false;
-
-       idx = eytzinger0_find_le(t->entries, t->nr,
-                                sizeof(t->entries[0]),
-                                journal_seq_blacklist_table_cmp,
-                                &search);
-       if (idx < 0)
-               return false;
-
-       BUG_ON(t->entries[idx].start > seq);
-
-       if (seq >= t->entries[idx].end)
-               return false;
-
-       if (dirty)
-               t->entries[idx].dirty = true;
-       return true;
-}
-
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c)
-{
-       struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
-
-       if (!t || !t->nr)
-               return 0;
-
-       return t->entries[eytzinger0_last(t->nr)].end - 1;
-}
-
-int bch2_blacklist_table_initialize(struct bch_fs *c)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl =
-               bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
-       struct journal_seq_blacklist_table *t;
-       unsigned i, nr = blacklist_nr_entries(bl);
-
-       if (!bl)
-               return 0;
-
-       t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
-       if (!t)
-               return bch_err_throw(c, ENOMEM_blacklist_table_init);
-
-       t->nr = nr;
-
-       for (i = 0; i < nr; i++) {
-               t->entries[i].start     = le64_to_cpu(bl->start[i].start);
-               t->entries[i].end       = le64_to_cpu(bl->start[i].end);
-       }
-
-       eytzinger0_sort(t->entries,
-                       t->nr,
-                       sizeof(t->entries[0]),
-                       journal_seq_blacklist_table_cmp,
-                       NULL);
-
-       kfree(c->journal_seq_blacklist_table);
-       c->journal_seq_blacklist_table = t;
-       return 0;
-}
-
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl =
-               field_to_type(f, journal_seq_blacklist);
-       unsigned i, nr = blacklist_nr_entries(bl);
-
-       for (i = 0; i < nr; i++) {
-               struct journal_seq_blacklist_entry *e = bl->start + i;
-
-               if (le64_to_cpu(e->start) >=
-                   le64_to_cpu(e->end)) {
-                       prt_printf(err, "entry %u start >= end (%llu >= %llu)",
-                              i, le64_to_cpu(e->start), le64_to_cpu(e->end));
-                       return -BCH_ERR_invalid_sb_journal_seq_blacklist;
-               }
-
-               if (i + 1 < nr &&
-                   le64_to_cpu(e[0].end) >
-                   le64_to_cpu(e[1].start)) {
-                       prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
-                              i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
-                       return -BCH_ERR_invalid_sb_journal_seq_blacklist;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
-                                                 struct bch_sb *sb,
-                                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_journal_seq_blacklist *bl =
-               field_to_type(f, journal_seq_blacklist);
-       struct journal_seq_blacklist_entry *i;
-       unsigned nr = blacklist_nr_entries(bl);
-
-       for (i = bl->start; i < bl->start + nr; i++) {
-               if (i != bl->start)
-                       prt_printf(out, " ");
-
-               prt_printf(out, "%llu-%llu",
-                      le64_to_cpu(i->start),
-                      le64_to_cpu(i->end));
-       }
-       prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
-       .validate       = bch2_sb_journal_seq_blacklist_validate,
-       .to_text        = bch2_sb_journal_seq_blacklist_to_text
-};
-
-bool bch2_blacklist_entries_gc(struct bch_fs *c)
-{
-       struct journal_seq_blacklist_entry *src, *dst;
-
-       struct bch_sb_field_journal_seq_blacklist *bl =
-               bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
-       if (!bl)
-               return false;
-
-       unsigned nr = blacklist_nr_entries(bl);
-       dst = bl->start;
-
-       struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
-       BUG_ON(nr != t->nr);
-
-       src = bl->start;
-       eytzinger0_for_each(i, nr) {
-               BUG_ON(t->entries[i].start      != le64_to_cpu(src->start));
-               BUG_ON(t->entries[i].end        != le64_to_cpu(src->end));
-
-               if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
-                       *dst++ = *src;
-               src++;
-       }
-
-       unsigned new_nr = dst - bl->start;
-       if (new_nr == nr)
-               return false;
-
-       bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
-       bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
-                                 new_nr ? sb_blacklist_u64s(new_nr) : 0);
-       BUG_ON(new_nr && !bl);
-       return true;
-}
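bch2_journal_seq_blacklist_add() keeps the superblock field as a sorted list of disjoint [start, end] ranges: every existing entry that overlaps or abuts the new range is absorbed into it and deleted, then the widened range is inserted in order. For example (numbers illustrative), adding [8, 15] to {[1, 3], [10, 12], [14, 20]} absorbs the last two entries and yields {[1, 3], [8, 20]}. The same merge loop over a plain array, assuming kernel min()/max() - a sketch, not the sb-resizing version above:

    struct bl_entry { u64 start, end; };

    /* Merge [start, end] into sorted disjoint @e; returns the new count: */
    static unsigned bl_add(struct bl_entry *e, unsigned nr, u64 start, u64 end)
    {
            unsigned i = 0;

            while (i < nr) {
                    if (end < e[i].start)
                            break;
                    if (start > e[i].end) {
                            i++;
                            continue;
                    }

                    /* Overlapping or contiguous: absorb entry and delete it: */
                    start   = min(start,    e[i].start);
                    end     = max(end,      e[i].end);
                    memmove(&e[i], &e[i + 1], (--nr - i) * sizeof(*e));
            }

            memmove(&e[i + 1], &e[i], (nr - i) * sizeof(*e));
            e[i] = (struct bl_entry) { .start = start, .end = end };
            return nr + 1;
    }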
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
deleted file mode 100644 (file)
index f06942c..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-
-static inline unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
-       return bl
-               ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
-                  sizeof(struct journal_seq_blacklist_entry))
-               : 0;
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
-int bch2_blacklist_table_initialize(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-
-bool bch2_blacklist_entries_gc(struct bch_fs *);
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
deleted file mode 100644 (file)
index 2566b12..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-
-struct journal_seq_blacklist_entry {
-       __le64                  start;
-       __le64                  end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
-       struct bch_sb_field     field;
-       struct journal_seq_blacklist_entry start[];
-};
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
deleted file mode 100644 (file)
index 51104bb..0000000
+++ /dev/null
@@ -1,342 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_TYPES_H
-#define _BCACHEFS_JOURNAL_TYPES_H
-
-#include <linux/cache.h>
-#include <linux/workqueue.h>
-
-#include "alloc_types.h"
-#include "super_types.h"
-#include "fifo.h"
-
-/* btree write buffer steals 8 bits for its own purposes: */
-#define JOURNAL_SEQ_MAX                ((1ULL << 56) - 1)
-
-#define JOURNAL_STATE_BUF_BITS 2
-#define JOURNAL_STATE_BUF_NR   (1U << JOURNAL_STATE_BUF_BITS)
-#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
-
-#define JOURNAL_BUF_BITS       4
-#define JOURNAL_BUF_NR         (1U << JOURNAL_BUF_BITS)
-#define JOURNAL_BUF_MASK       (JOURNAL_BUF_NR - 1)
-
-/*
- * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
- * the journal that are being staged or in flight.
- */
-struct journal_buf {
-       struct closure          io;
-       struct jset             *data;
-
-       __BKEY_PADDED(key, BCH_REPLICAS_MAX);
-       struct bch_devs_list    devs_written;
-
-       struct closure_waitlist wait;
-       u64                     last_seq;       /* copy of data->last_seq */
-       long                    expires;
-       u64                     flush_time;
-
-       unsigned                buf_size;       /* size in bytes of @data */
-       unsigned                sectors;        /* maximum size for current entry */
-       unsigned                disk_sectors;   /* maximum size entry could have been, if
-                                                  buf_size was bigger */
-       unsigned                u64s_reserved;
-       bool                    noflush:1;      /* write has already been kicked off, and was noflush */
-       bool                    must_flush:1;   /* something wants a flush */
-       bool                    separate_flush:1;
-       bool                    need_flush_to_write_buffer:1;
-       bool                    write_started:1;
-       bool                    write_allocated:1;
-       bool                    write_done:1;
-       u8                      idx;
-};
-
-/*
- * Something that makes a journal entry dirty - i.e. a btree node that has to be
- * flushed:
- */
-
-enum journal_pin_type {
-       JOURNAL_PIN_TYPE_btree3,
-       JOURNAL_PIN_TYPE_btree2,
-       JOURNAL_PIN_TYPE_btree1,
-       JOURNAL_PIN_TYPE_btree0,
-       JOURNAL_PIN_TYPE_key_cache,
-       JOURNAL_PIN_TYPE_other,
-       JOURNAL_PIN_TYPE_NR,
-};
-
-struct journal_entry_pin_list {
-       struct list_head                unflushed[JOURNAL_PIN_TYPE_NR];
-       struct list_head                flushed[JOURNAL_PIN_TYPE_NR];
-       atomic_t                        count;
-       struct bch_devs_list            devs;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef int (*journal_pin_flush_fn)(struct journal *j,
-                               struct journal_entry_pin *, u64);
-
-struct journal_entry_pin {
-       struct list_head                list;
-       journal_pin_flush_fn            flush;
-       u64                             seq;
-};
-
-struct journal_res {
-       bool                    ref;
-       u16                     u64s;
-       u32                     offset;
-       u64                     seq;
-};
-
-union journal_res_state {
-       struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
-       struct {
-               u64             cur_entry_offset:22,
-                               idx:2,
-                               buf0_count:10,
-                               buf1_count:10,
-                               buf2_count:10,
-                               buf3_count:10;
-       };
-};
-
-/* bytes: */
-#define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX         (4U  << 22) /* 16M */
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- * note - cur_entry_offset is in units of u64s
- */
-#define JOURNAL_ENTRY_OFFSET_MAX       ((1U << 22) - 1)
-
-#define JOURNAL_ENTRY_BLOCKED_VAL      (JOURNAL_ENTRY_OFFSET_MAX - 2)
-#define JOURNAL_ENTRY_CLOSED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL                (JOURNAL_ENTRY_OFFSET_MAX)
-
-struct journal_space {
-       /* Units of 512-byte sectors: */
-       unsigned        next_entry; /* How big the next journal entry can be */
-       unsigned        total;
-};
-
-enum journal_space_from {
-       journal_space_discarded,
-       journal_space_clean_ondisk,
-       journal_space_clean,
-       journal_space_total,
-       journal_space_nr,
-};
-
-#define JOURNAL_FLAGS()                        \
-       x(replay_done)                  \
-       x(running)                      \
-       x(may_skip_flush)               \
-       x(need_flush_write)             \
-       x(space_low)
-
-enum journal_flags {
-#define x(n)   JOURNAL_##n,
-       JOURNAL_FLAGS()
-#undef x
-};
-
-struct journal_bio {
-       struct bch_dev          *ca;
-       unsigned                buf_idx;
-       u64                     submit_time;
-
-       struct bio              bio;
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
-       /* Fastpath stuff up front: */
-       struct {
-
-       union journal_res_state reservations;
-       enum bch_watermark      watermark;
-
-       } __aligned(SMP_CACHE_BYTES);
-
-       unsigned long           flags;
-
-       /* Max size of current journal entry */
-       unsigned                cur_entry_u64s;
-       unsigned                cur_entry_sectors;
-
-       /* Reserved space in journal entry to be used just prior to write */
-       unsigned                entry_u64s_reserved;
-
-
-       /*
-        * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
-        * insufficient devices:
-        */
-       int                     cur_entry_error;
-       unsigned                cur_entry_offset_if_blocked;
-
-       unsigned                buf_size_want;
-       /*
-        * We may queue up some things to be journalled (log messages) before
-        * the journal has actually started - stash them here:
-        */
-       darray_u64              early_journal_entries;
-
-       /*
-        * Protects journal_buf->data, when accessing without a journal
-        * reservation: for synchronization between the btree write buffer code
-        * and the journal write path:
-        */
-       struct mutex            buf_lock;
-       /*
-        * JOURNAL_BUF_NR journal entry buffers -- one is currently open for
-        * new entries, the others may be in flight or being written out.
-        */
-       struct journal_buf      buf[JOURNAL_BUF_NR];
-       void                    *free_buf;
-       unsigned                free_buf_size;
-
-       spinlock_t              lock;
-
-       /* if nonzero, we may not open a new journal entry: */
-       unsigned                blocked;
-
-       /* Used when waiting because the journal was full */
-       wait_queue_head_t       wait;
-       struct closure_waitlist async_wait;
-       struct closure_waitlist reclaim_flush_wait;
-
-       struct delayed_work     write_work;
-       struct workqueue_struct *wq;
-
-       /* Sequence number of most recent journal entry (last entry in @pin) */
-       atomic64_t              seq;
-
-       u64                     seq_write_started;
-       /* seq, last_seq from the most recent journal entry successfully written */
-       u64                     seq_ondisk;
-       u64                     flushed_seq_ondisk;
-       u64                     flushing_seq;
-       u64                     last_seq_ondisk;
-       u64                     err_seq;
-       u64                     last_empty_seq;
-       u64                     oldest_seq_found_ondisk;
-
-       /*
-        * FIFO of journal entries whose btree updates have not yet been
-        * written out.
-        *
-        * Each entry is a reference count. The position in the FIFO is the
-        * entry's sequence number relative to @seq.
-        *
-        * The journal entry itself holds a reference count, put when the
-        * journal entry is written out. Each btree node modified by the journal
-        * entry also holds a reference count, put when the btree node is
-        * written.
-        *
-        * When a reference count reaches zero, the journal entry is no longer
-        * needed. When all journal entries in the oldest journal bucket are no
-        * longer needed, the bucket can be discarded and reused.
-        */
-       struct {
-               u64 front, back, size, mask;
-               struct journal_entry_pin_list *data;
-       }                       pin;
-
-       struct journal_space    space[journal_space_nr];
-
-       u64                     replay_journal_seq;
-       u64                     replay_journal_seq_end;
-
-       struct write_point      wp;
-       spinlock_t              err_lock;
-
-       struct mutex            reclaim_lock;
-       /*
-        * Used for waiting until journal reclaim has freed up space in the
-        * journal:
-        */
-       wait_queue_head_t       reclaim_wait;
-       struct task_struct      *reclaim_thread;
-       bool                    reclaim_kicked;
-       unsigned long           next_reclaim;
-       u64                     nr_direct_reclaim;
-       u64                     nr_background_reclaim;
-
-       unsigned long           last_flushed;
-       struct journal_entry_pin *flush_in_progress;
-       bool                    flush_in_progress_dropped;
-       wait_queue_head_t       pin_flush_wait;
-
-       /* protects advancing ja->discard_idx: */
-       struct mutex            discard_lock;
-       bool                    can_discard;
-
-       unsigned long           last_flush_write;
-
-       u64                     write_start_time;
-
-       u64                     nr_flush_writes;
-       u64                     nr_noflush_writes;
-       u64                     entry_bytes_written;
-
-       struct bch2_time_stats  *flush_write_time;
-       struct bch2_time_stats  *noflush_write_time;
-       struct bch2_time_stats  *flush_seq_time;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      res_map;
-#endif
-} __aligned(SMP_CACHE_BYTES);
-
-/*
- * Embedded in struct bch_dev. First three fields refer to the array of journal
- * buckets, in bch_sb.
- */
-struct journal_device {
-       /*
-        * For each journal bucket, the max sequence number of the journal
-        * writes it contains - so we know when a bucket can be reused.
-        */
-       u64                     *bucket_seq;
-
-       unsigned                sectors_free;
-
-       /*
-        * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
-        */
-       unsigned                discard_idx;            /* Next bucket to discard */
-       unsigned                dirty_idx_ondisk;
-       unsigned                dirty_idx;
-       unsigned                cur_idx;                /* Journal bucket we're currently writing to */
-       unsigned                nr;
-
-       u64                     *buckets;
-
-       /* Bio for journal reads/writes to this device */
-       struct journal_bio      *bio[JOURNAL_BUF_NR];
-
-       /* for bch_journal_read_device */
-       struct closure          read;
-       u64                     highest_seq_found;
-};
-
-/*
- * journal_entry_res - reserve space in every journal entry:
- */
-struct journal_entry_res {
-       unsigned                u64s;
-};
-
-#endif /* _BCACHEFS_JOURNAL_TYPES_H */
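union journal_res_state packs the whole reservation fast path into one 64-bit word so it can be read and updated with single atomic operations: 22 bits of write offset (in u64s) into the currently open entry, 2 bits selecting which of the JOURNAL_STATE_BUF_NR state buffers is open, and a 10-bit outstanding-reservation count per buffer. A sketch of interpreting one snapshot of that word using the sentinel values defined above (the helper name is made up):

    /* @s: a snapshot, e.g. from atomic64_read(&j->reservations.counter): */
    static bool journal_entry_is_open_sketch(union journal_res_state s)
    {
            if (s.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
                    return false;   /* journal went read-only */
            if (s.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL ||
                s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
                    return false;   /* no entry open for new reservations */

            /*
             * Otherwise state buffer s.idx is open and filled to
             * s.cur_entry_offset u64s; buf0_count..buf3_count hold the
             * outstanding reservations against each state buffer.
             */
            return true;
    }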
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
deleted file mode 100644 (file)
index 1b828bd..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "keylist.h"
-
-int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
-                       size_t nr_inline_u64s, size_t new_u64s)
-{
-       size_t oldsize = bch2_keylist_u64s(l);
-       size_t newsize = oldsize + new_u64s;
-       u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
-       u64 *new_keys;
-
-       newsize = roundup_pow_of_two(newsize);
-
-       if (newsize <= nr_inline_u64s ||
-           (old_buf && roundup_pow_of_two(oldsize) == newsize))
-               return 0;
-
-       new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
-       if (!new_keys)
-               return -ENOMEM;
-
-       if (!old_buf)
-               memcpy_u64s(new_keys, inline_u64s, oldsize);
-
-       l->keys_p = new_keys;
-       l->top_p = new_keys + oldsize;
-
-       return 0;
-}
-
-void bch2_keylist_pop_front(struct keylist *l)
-{
-       l->top_p -= bch2_keylist_front(l)->k.u64s;
-
-       memmove_u64s_down(l->keys,
-                         bkey_next(l->keys),
-                         bch2_keylist_u64s(l));
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *l)
-{
-       for_each_keylist_key(l, k)
-               BUG_ON(bkey_next(k) != l->top &&
-                      bpos_ge(k->k.p, bkey_next(k)->k.p));
-}
-#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
deleted file mode 100644 (file)
index e687e0e..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_H
-#define _BCACHEFS_KEYLIST_H
-
-#include "keylist_types.h"
-
-int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_pop_front(struct keylist *);
-
-static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
-{
-       l->top_p = l->keys_p = inline_keys;
-}
-
-static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
-{
-       if (l->keys_p != inline_keys)
-               kfree(l->keys_p);
-}
-
-static inline void bch2_keylist_push(struct keylist *l)
-{
-       l->top = bkey_next(l->top);
-}
-
-static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
-{
-       bkey_copy(l->top, k);
-       bch2_keylist_push(l);
-}
-
-static inline bool bch2_keylist_empty(struct keylist *l)
-{
-       return l->top == l->keys;
-}
-
-static inline size_t bch2_keylist_u64s(struct keylist *l)
-{
-       return l->top_p - l->keys_p;
-}
-
-static inline size_t bch2_keylist_bytes(struct keylist *l)
-{
-       return bch2_keylist_u64s(l) * sizeof(u64);
-}
-
-static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
-{
-       return l->keys;
-}
-
-#define for_each_keylist_key(_keylist, _k)                     \
-       for (struct bkey_i *_k = (_keylist)->keys;              \
-            _k != (_keylist)->top;                             \
-            _k = bkey_next(_k))
-
-static inline u64 keylist_sectors(struct keylist *keys)
-{
-       u64 ret = 0;
-
-       for_each_keylist_key(keys, k)
-               ret += k->k.size;
-       return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *);
-#else
-static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
-#endif
-
-#endif /* _BCACHEFS_KEYLIST_H */
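A keylist is a bump allocator for variable-size bkeys: keys points at the first key, top one past the last, and the u64 aliases let the same buffer start on the stack and move to the heap via bch2_keylist_realloc(). A minimal usage sketch - the buffer size and the source key @k are assumptions:

    static int keylist_example(struct bkey_i *k)
    {
            u64 inline_keys[32];    /* small on-stack buffer, in u64s */
            struct keylist keys;

            bch2_keylist_init(&keys, inline_keys);

            /* Make room (possibly reallocating to the heap) before pushing: */
            int ret = bch2_keylist_realloc(&keys, inline_keys,
                                           ARRAY_SIZE(inline_keys), k->k.u64s);
            if (ret)
                    return ret;

            bch2_keylist_add(&keys, k);

            for_each_keylist_key(&keys, i)
                    pr_info("key of %u u64s\n", i->k.u64s);

            bch2_keylist_free(&keys, inline_keys);
            return 0;
    }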
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
deleted file mode 100644 (file)
index 4b3ff7d..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_TYPES_H
-#define _BCACHEFS_KEYLIST_TYPES_H
-
-struct keylist {
-       union {
-               struct bkey_i           *keys;
-               u64                     *keys_p;
-       };
-       union {
-               struct bkey_i           *top;
-               u64                     *top_p;
-       };
-};
-
-#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
deleted file mode 100644 (file)
index 75f27ec..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "error.h"
-#include "io_misc.h"
-#include "logged_ops.h"
-#include "super.h"
-
-struct bch_logged_op_fn {
-       u8              type;
-       int             (*resume)(struct btree_trans *, struct bkey_i *);
-};
-
-static const struct bch_logged_op_fn logged_op_fns[] = {
-#define x(n)           {                                       \
-       .type           = KEY_TYPE_logged_op_##n,               \
-       .resume         = bch2_resume_logged_op_##n,            \
-},
-       BCH_LOGGED_OPS()
-#undef x
-};
-
-static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
-{
-       for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
-               if (logged_op_fns[i].type == type)
-                       return logged_op_fns + i;
-       return NULL;
-}
-
-static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
-                           struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       u32 restart_count = trans->restart_count;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
-                   trans, logged_op_but_clean,
-                   "filesystem marked as clean but have logged op\n%s",
-                   (bch2_bkey_val_to_text(&buf, c, k),
-                    buf.buf));
-
-       struct bkey_buf sk;
-       bch2_bkey_buf_init(&sk);
-       bch2_bkey_buf_reassemble(&sk, c, k);
-
-       const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
-       if (fn)
-               fn->resume(trans, sk.k);
-
-       ret = bch2_logged_op_finish(trans, sk.k);
-
-       bch2_bkey_buf_exit(&sk, c);
-fsck_err:
-       printbuf_exit(&buf);
-       return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter,
-                                  BTREE_ID_logged_ops,
-                                  POS(LOGGED_OPS_INUM_logged_ops, 0),
-                                  POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
-                                  BTREE_ITER_prefetch, k,
-                       resume_logged_op(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
-       struct btree_iter iter;
-       int ret = bch2_bkey_get_empty_slot(trans, &iter,
-                                BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
-       if (ret)
-               return ret;
-
-       k->k.p = iter.pos;
-
-       ret = bch2_trans_update(trans, &iter, k, 0);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
-       return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                        __bch2_logged_op_start(trans, k));
-}
-
-int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
-{
-       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                           bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
-       /*
-        * This needs to be a fatal error because we've left an unfinished
-        * operation in the logged ops btree.
-        *
-        * We should only ever see an error here if the filesystem has already
-        * been shut down, but make sure of that here:
-        */
-       if (ret) {
-               struct bch_fs *c = trans->c;
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-               bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
-                                   buf.buf, bch2_err_str(ret));
-               printbuf_exit(&buf);
-       }
-
-       return ret;
-}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
deleted file mode 100644 (file)
index 30ae9ef..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_H
-#define _BCACHEFS_LOGGED_OPS_H
-
-#include "bkey.h"
-
-#define BCH_LOGGED_OPS()                       \
-       x(truncate)                             \
-       x(finsert)
-
-static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
-{
-       return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *);
-int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
-
-#endif /* _BCACHEFS_LOGGED_OPS_H */
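The intended lifecycle is the mirror image of the resume path in logged_ops.c: persist a description of the operation before starting it, update that description at each restart-safe checkpoint, and delete it when the operation completes, so a crash in between is replayed by bch2_resume_logged_ops() on the next unclean mount. A hedged sketch, assuming @op is a bkey_i of one of the BCH_LOGGED_OPS() types (the real callers are the truncate and finsert paths):

    static int run_logged_op_sketch(struct btree_trans *trans, struct bkey_i *op)
    {
            /* Persist the op so recovery can resume it after a crash: */
            int ret = bch2_logged_op_start(trans, op);
            if (ret)
                    return ret;

            /*
             * ... do one unit of work, update fields in @op, then record the
             * checkpoint so a crash resumes here instead of at the start:
             */
            ret = bch2_logged_op_update(trans, op);

            /* Finished (or aborted): remove the logged op again: */
            return bch2_logged_op_finish(trans, op) ?: ret;
    }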
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
deleted file mode 100644 (file)
index cfb67c9..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
-#define _BCACHEFS_LOGGED_OPS_FORMAT_H
-
-enum logged_ops_inums {
-       LOGGED_OPS_INUM_logged_ops,
-       LOGGED_OPS_INUM_inode_cursors,
-};
-
-struct bch_logged_op_truncate {
-       struct bch_val          v;
-       __le32                  subvol;
-       __le32                  pad;
-       __le64                  inum;
-       __le64                  new_i_size;
-};
-
-enum logged_op_finsert_state {
-       LOGGED_OP_FINSERT_start,
-       LOGGED_OP_FINSERT_shift_extents,
-       LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
-       struct bch_val          v;
-       __u8                    state;
-       __u8                    pad[3];
-       __le32                  subvol;
-       __le64                  inum;
-       __le64                  dst_offset;
-       __le64                  src_offset;
-       __le64                  pos;
-};
-
-#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
deleted file mode 100644 (file)
index 57b5b32..0000000
+++ /dev/null
@@ -1,223 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-
-/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
-                     struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(!lru_pos_time(k.k->p),
-                        c, lru_entry_at_time_0,
-                        "lru entry at time=0");
-fsck_err:
-       return ret;
-}
-
-void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
-                     struct bkey_s_c k)
-{
-       const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
-
-       prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
-}
-
-void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
-{
-       prt_printf(out, "%llu:%llu -> %llu:%llu",
-                  lru_pos_id(lru),
-                  lru_pos_time(lru),
-                  u64_to_bucket(lru.offset).inode,
-                  u64_to_bucket(lru.offset).offset);
-}
-
-static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
-                         u64 dev_bucket, u64 time, bool set)
-{
-       return time
-               ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
-                                             lru_pos(lru_id, dev_bucket, time), set)
-               : 0;
-}
-
-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
-       return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
-       return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
-}
-
-int __bch2_lru_change(struct btree_trans *trans,
-                     u16 lru_id, u64 dev_bucket,
-                     u64 old_time, u64 new_time)
-{
-       if (old_time == new_time)
-               return 0;
-
-       return  bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
-               bch2_lru_set(trans, lru_id, dev_bucket, new_time);
-}
-
-static const char * const bch2_lru_types[] = {
-#define x(n) #n,
-       BCH_LRU_TYPES()
-#undef x
-       NULL
-};
-
-int bch2_lru_check_set(struct btree_trans *trans,
-                      u16 lru_id,
-                      u64 dev_bucket,
-                      u64 time,
-                      struct bkey_s_c referring_k,
-                      struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct btree_iter lru_iter;
-       struct bkey_s_c lru_k =
-               bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
-                                  lru_pos(lru_id, dev_bucket, time), 0);
-       int ret = bkey_err(lru_k);
-       if (ret)
-               return ret;
-
-       if (lru_k.k->type != KEY_TYPE_set) {
-               ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
-               if (ret)
-                       goto err;
-
-               if (fsck_err(trans, alloc_key_to_missing_lru_entry,
-                            "missing %s lru entry\n%s",
-                            bch2_lru_types[lru_type(lru_k)],
-                            (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
-                       ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
-                       if (ret)
-                               goto err;
-               }
-       }
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &lru_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
-{
-       enum bch_lru_type type = lru_type(lru_k);
-
-       switch (type) {
-       case BCH_LRU_read:
-       case BCH_LRU_fragmentation:
-               return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
-       case BCH_LRU_stripes:
-               return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
-       default:
-               BUG();
-       }
-}
-
-static u64 bkey_lru_type_idx(struct bch_fs *c,
-                            enum bch_lru_type type,
-                            struct bkey_s_c k)
-{
-       struct bch_alloc_v4 a_convert;
-       const struct bch_alloc_v4 *a;
-
-       switch (type) {
-       case BCH_LRU_read:
-               a = bch2_alloc_to_v4(k, &a_convert);
-               return alloc_lru_idx_read(*a);
-       case BCH_LRU_fragmentation: {
-               a = bch2_alloc_to_v4(k, &a_convert);
-
-               guard(rcu)();
-               struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
-               return ca
-                       ? alloc_lru_idx_fragmentation(*a, ca)
-                       : 0;
-       }
-       case BCH_LRU_stripes:
-               return k.k->type == KEY_TYPE_stripe
-                       ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
-                       : 0;
-       default:
-               BUG();
-       }
-}
-
-static int bch2_check_lru_key(struct btree_trans *trans,
-                             struct btree_iter *lru_iter,
-                             struct bkey_s_c lru_k,
-                             struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-
-       struct bbpos bp = lru_pos_to_bp(lru_k);
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
-       int ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       enum bch_lru_type type = lru_type(lru_k);
-       u64 idx = bkey_lru_type_idx(c, type, k);
-
-       if (lru_pos_time(lru_k.k->p) != idx) {
-               ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
-               if (ret)
-                       goto err;
-
-               if (fsck_err(trans, lru_entry_bad,
-                            "incorrect lru entry: lru %s time %llu\n"
-                            "%s\n"
-                            "for %s",
-                            bch2_lru_types[type],
-                            lru_pos_time(lru_k.k->p),
-                            (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
-                            (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
-                       ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
-       }
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf2);
-       printbuf_exit(&buf1);
-       return ret;
-}
-
-int bch2_check_lrus(struct bch_fs *c)
-{
-       struct bkey_buf last_flushed;
-
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_check_lru_key(trans, &iter, k, &last_flushed)));
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch_err_fn(c, ret);
-       return ret;
-}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
deleted file mode 100644 (file)
index 8abd0aa..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_H
-#define _BCACHEFS_LRU_H
-
-static inline u64 lru_pos_id(struct bpos pos)
-{
-       return pos.inode >> LRU_TIME_BITS;
-}
-
-static inline u64 lru_pos_time(struct bpos pos)
-{
-       return pos.inode & ~(~0ULL << LRU_TIME_BITS);
-}
-
-static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
-{
-       struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
-
-       EBUG_ON(time > LRU_TIME_MAX);
-       EBUG_ON(lru_pos_id(pos) != lru_id);
-       EBUG_ON(lru_pos_time(pos) != time);
-       EBUG_ON(pos.offset != dev_bucket);
-
-       return pos;
-}
-
-static inline enum bch_lru_type lru_type(struct bkey_s_c l)
-{
-       u16 lru_id = l.k->p.inode >> 48;
-
-       switch (lru_id) {
-       case BCH_LRU_BUCKET_FRAGMENTATION:
-               return BCH_LRU_fragmentation;
-       case BCH_LRU_STRIPE_FRAGMENTATION:
-               return BCH_LRU_stripes;
-       default:
-               return BCH_LRU_read;
-       }
-}
-
-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
-void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
-
-#define bch2_bkey_ops_lru ((struct bkey_ops) { \
-       .key_validate   = bch2_lru_validate,    \
-       .val_to_text    = bch2_lru_to_text,     \
-       .min_val_size   = 8,                    \
-})
-
-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
-
-static inline int bch2_lru_change(struct btree_trans *trans,
-                     u16 lru_id, u64 dev_bucket,
-                     u64 old_time, u64 new_time)
-{
-       return old_time != new_time
-               ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
-               : 0;
-}
-
-struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
-int bch2_check_lrus(struct bch_fs *);
-
-#endif /* _BCACHEFS_LRU_H */
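An LRU key encodes three values in its position: the 16-bit LRU id in the top bits of pos.inode, the 48-bit time in the low bits (LRU_TIME_BITS), and the dev:bucket in pos.offset. A worked example with illustrative numbers:

    /* lru_id 5, dev_bucket 42, time 1000: */
    struct bpos pos = lru_pos(5, 42, 1000);

    /* pos.inode == ((u64) 5 << 48) | 1000 and pos.offset == 42, so: */
    BUG_ON(lru_pos_id(pos)   != 5);         /* pos.inode >> 48 */
    BUG_ON(lru_pos_time(pos) != 1000);      /* pos.inode & ((1ULL << 48) - 1) */
    BUG_ON(pos.offset        != 42);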
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
deleted file mode 100644 (file)
index b7392ad..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_FORMAT_H
-#define _BCACHEFS_LRU_FORMAT_H
-
-struct bch_lru {
-       struct bch_val          v;
-       __le64                  idx;
-} __packed __aligned(8);
-
-#define BCH_LRU_TYPES()                \
-       x(read)                 \
-       x(fragmentation)        \
-       x(stripes)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
-       BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_BUCKET_FRAGMENTATION   ((1U << 16) - 1)
-#define BCH_LRU_STRIPE_FRAGMENTATION   ((1U << 16) - 2)
-
-#define LRU_TIME_BITS                  48
-#define LRU_TIME_MAX                   ((1ULL << LRU_TIME_BITS) - 1)
-
-#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
deleted file mode 100644 (file)
index 0ea9f30..0000000
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions for incremental mean and variance.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Copyright © 2022 Daniel B. Hill
- *
- * Author: Daniel B. Hill <daniel@gluo.nz>
- *
- * Description:
- *
- * This includes some incremental algorithms for mean and variance calculation.
- *
- * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- *
- * Create a struct; for the weighted variant, the ewma weight (2^k) is passed
- * to each update call.
- *
- * Use mean_and_variance[_weighted]_update() on the struct to update its state.
- *
- * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean
- * and variance; some computation is deferred to these functions for
- * performance reasons.
- *
- * See lib/math/mean_and_variance_test.c for examples of usage.
- *
- * DO NOT access the mean and variance fields of the weighted variants directly.
- * DO NOT change the weight after calling update.
- */
-
-#include <linux/bug.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-#include <linux/module.h>
-
-#include "mean_and_variance.h"
-
-u128_u u128_div(u128_u n, u64 d)
-{
-       u128_u r;
-       u64 rem;
-       u64 hi = u128_hi(n);
-       u64 lo = u128_lo(n);
-       u64  h =  hi & ((u64) U32_MAX  << 32);
-       u64  l = (hi &  (u64) U32_MAX) << 32;
-
-       r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
-       r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
-       r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
-       return r;
-}
-EXPORT_SYMBOL_GPL(u128_div);
-
-/**
- * mean_and_variance_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- */
-s64 mean_and_variance_get_mean(struct mean_and_variance s)
-{
-       return s.n ? div64_u64(s.sum, s.n) : 0;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
-
-/**
- * mean_and_variance_get_variance() - get variance from @s1
- * @s1: mean and variance number of samples and sums
- *
- * see linked pdf equation 12.
- */
-u64 mean_and_variance_get_variance(struct mean_and_variance s1)
-{
-       if (s1.n) {
-               u128_u s2 = u128_div(s1.sum_squares, s1.n);
-               u64  s3 = abs(mean_and_variance_get_mean(s1));
-
-               return u128_lo(u128_sub(s2, u128_square(s3)));
-       } else {
-               return 0;
-       }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
-
-/**
- * mean_and_variance_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- */
-u32 mean_and_variance_get_stddev(struct mean_and_variance s)
-{
-       return int_sqrt64(mean_and_variance_get_variance(s));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
-
-/**
- * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s: mean and variance number of samples and their sums
- * @x: new value to include in the &mean_and_variance_weighted
- * @initted: caller must track whether this is the first use or not
- * @weight: ewma weight
- *
- * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
- * values are stored bitshifted for performance and added precision.
- */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
-               s64 x, bool initted, u8 weight)
-{
-       // previous weighted variance.
-       u8 w            = weight;
-       u64 var_w0      = s->variance;
-       // new value weighted.
-       s64 x_w         = x << w;
-       s64 diff_w      = x_w - s->mean;
-       s64 diff        = fast_divpow2(diff_w, w);
-       // new mean weighted.
-       s64 u_w1        = s->mean + diff;
-
-       if (!initted) {
-               s->mean = x_w;
-               s->variance = 0;
-       } else {
-               s->mean = u_w1;
-               s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
-       }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
-
-/**
- * mean_and_variance_weighted_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
-               u8 weight)
-{
-       return fast_divpow2(s.mean, weight);
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
-
-/**
- * mean_and_variance_weighted_get_variance() - get variance from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
-               u8 weight)
-{
-	// always positive; don't need fast_divpow2
-       return s.variance >> weight;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
-
-/**
- * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
-               u8 weight)
-{
-       return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_LICENSE("GPL");
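
For reference, the incremental scheme this file implements reduces to variance = E[x²] − E[x]² (equation 12 in the linked paper). A self-contained user-space sketch using plain 64-bit sums; the kernel version accumulates sum_squares in 128 bits (the u128 helpers) so large samples can't overflow:

	#include <stdint.h>
	#include <stdio.h>

	struct mv {
		int64_t  n;
		int64_t  sum;
		uint64_t sum_squares;
	};

	static void mv_update(struct mv *s, int64_t v)
	{
		s->n++;
		s->sum += v;
		s->sum_squares += (uint64_t)(v * v);
	}

	static int64_t mv_mean(struct mv s)
	{
		return s.n ? s.sum / s.n : 0;
	}

	static uint64_t mv_variance(struct mv s)
	{
		if (!s.n)
			return 0;
		int64_t m = mv_mean(s);
		/* E[x^2] - E[x]^2, equation 12 */
		return s.sum_squares / (uint64_t)s.n - (uint64_t)(m * m);
	}

	int main(void)
	{
		struct mv s = { 0 };
		int64_t samples[] = { 2, 2, 4, 4 };

		for (unsigned i = 0; i < 4; i++)
			mv_update(&s, samples[i]);

		/* prints mean=3 variance=1, matching mean_and_variance_basic_test() */
		printf("mean=%lld variance=%llu\n",
		       (long long)mv_mean(s), (unsigned long long)mv_variance(s));
		return 0;
	}
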
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
deleted file mode 100644 (file)
index 47e4a3c..0000000
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef MEAN_AND_VARIANCE_H_
-#define MEAN_AND_VARIANCE_H_
-
-#include <linux/types.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-
-#define SQRT_U64_MAX 4294967295ULL
-
-/*
- * u128_u: u128 user mode, because not all architectures support a real int128
- * type
- *
- * We don't use this version in userspace, because in userspace we link with
- * Rust and rustc has issues with u128.
- */
-
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
-
-typedef struct {
-       unsigned __int128 v;
-} __aligned(16) u128_u;
-
-static inline u128_u u64_to_u128(u64 a)
-{
-       return (u128_u) { .v = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
-       return a.v;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
-       return a.v >> 64;
-}
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
-       a.v += b.v;
-       return a;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
-       a.v -= b.v;
-       return a;
-}
-
-static inline u128_u u128_shl(u128_u a, s8 shift)
-{
-       a.v <<= shift;
-       return a;
-}
-
-static inline u128_u u128_square(u64 a)
-{
-       u128_u b = u64_to_u128(a);
-
-       b.v *= b.v;
-       return b;
-}
-
-#else
-
-typedef struct {
-       u64 hi, lo;
-} __aligned(16) u128_u;
-
-/* conversions */
-
-static inline u128_u u64_to_u128(u64 a)
-{
-       return (u128_u) { .lo = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
-       return a.lo;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
-       return a.hi;
-}
-
-/* arithmetic */
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
-       u128_u c;
-
-       c.lo = a.lo + b.lo;
-       c.hi = a.hi + b.hi + (c.lo < a.lo);
-       return c;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
-       u128_u c;
-
-       c.lo = a.lo - b.lo;
-       c.hi = a.hi - b.hi - (c.lo > a.lo);
-       return c;
-}
-
-static inline u128_u u128_shl(u128_u i, s8 shift)
-{
-       u128_u r;
-
-       r.lo = i.lo << (shift & 63);
-       if (shift < 64)
-               r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
-       else {
-               r.hi = i.lo << (-shift & 63);
-               r.lo = 0;
-       }
-       return r;
-}
-
-static inline u128_u u128_square(u64 i)
-{
-       u128_u r;
-       u64  h = i >> 32, l = i & U32_MAX;
-
-       r =             u128_shl(u64_to_u128(h*h), 64);
-       r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
-       r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
-       r = u128_add(r,          u64_to_u128(l*l));
-       return r;
-}
-
-#endif
-
-static inline u128_u u64s_to_u128(u64 hi, u64 lo)
-{
-       u128_u c = u64_to_u128(hi);
-
-       c = u128_shl(c, 64);
-       c = u128_add(c, u64_to_u128(lo));
-       return c;
-}
-
-u128_u u128_div(u128_u n, u64 d);
-
-struct mean_and_variance {
-       s64     n;
-       s64     sum;
-       u128_u  sum_squares;
-};
-
-/* exponentially weighted variant */
-struct mean_and_variance_weighted {
-       s64     mean;
-       u64     variance;
-};
-
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-static inline s64 fast_divpow2(s64 n, u8 d)
-{
-       return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
-
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s with a new
- * sample @v.
- * @s: the mean_and_variance to update.
- * @v: the new sample.
- *
- * see linked pdf equation 12.
- */
-static inline void
-mean_and_variance_update(struct mean_and_variance *s, s64 v)
-{
-       s->n++;
-       s->sum += v;
-       s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
-}
-
-s64 mean_and_variance_get_mean(struct mean_and_variance s);
-u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
-               s64 v, bool initted, u8 weight);
-
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
-               u8 weight);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
-               u8 weight);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
-               u8 weight);
-
-#endif // MEAN_AND_VARIANCE_H_
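
One subtlety worth calling out: fast_divpow2() adds a bias before shifting because a plain arithmetic right shift rounds towards negative infinity, while C division rounds towards zero. A standalone check of that behaviour (same definition as in the header above):

	#include <assert.h>
	#include <stdint.h>

	/* n / 2^d, rounding towards zero: a bias of (2^d - 1) is added for
	 * negative n, since >> alone would round towards -inf. (Right-shifting
	 * a negative value is arithmetic on all compilers the kernel supports.) */
	static int64_t fast_divpow2(int64_t n, uint8_t d)
	{
		return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
	}

	int main(void)
	{
		assert(fast_divpow2(7, 1)  ==  3);	/* 7 / 2 */
		assert(fast_divpow2(-7, 1) == -3);	/* -7 / 2, as C division gives */
		assert((-7 >> 1) == -4);		/* the naive shift rounds the wrong way */
		return 0;
	}
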
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
deleted file mode 100644 (file)
index e9d9c02..0000000
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <kunit/test.h>
-
-#include "mean_and_variance.h"
-
-#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
-
-static void mean_and_variance_basic_test(struct kunit *test)
-{
-       struct mean_and_variance s = {};
-
-       mean_and_variance_update(&s, 2);
-       mean_and_variance_update(&s, 2);
-
-       KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
-       KUNIT_EXPECT_EQ(test, s.n, 2);
-
-       mean_and_variance_update(&s, 4);
-       mean_and_variance_update(&s, 4);
-
-       KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
-       KUNIT_EXPECT_EQ(test, s.n, 4);
-}
-
-/*
- * Test values computed using a spreadsheet from the pseudocode at the bottom of:
- * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- */
-
-static void mean_and_variance_weighted_test(struct kunit *test)
-{
-       struct mean_and_variance_weighted s = { };
-
-       mean_and_variance_weighted_update(&s, 10, false, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
-       mean_and_variance_weighted_update(&s, 20, true, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
-       mean_and_variance_weighted_update(&s, 30, true, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-
-       s = (struct mean_and_variance_weighted) { };
-
-       mean_and_variance_weighted_update(&s, -10, false, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
-       mean_and_variance_weighted_update(&s, -20, true, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
-       mean_and_variance_weighted_update(&s, -30, true, 2);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-}
-
-static void mean_and_variance_weighted_advanced_test(struct kunit *test)
-{
-       struct mean_and_variance_weighted s = { };
-       bool initted = false;
-       s64 i;
-
-       for (i = 10; i <= 100; i += 10) {
-               mean_and_variance_weighted_update(&s, i, initted, 8);
-               initted = true;
-       }
-
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-
-       s = (struct mean_and_variance_weighted) { };
-       initted = false;
-
-       for (i = -10; i >= -100; i -= 10) {
-               mean_and_variance_weighted_update(&s, i, initted, 8);
-               initted = true;
-       }
-
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
-       KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-}
-
-static void do_mean_and_variance_test(struct kunit *test,
-                                     s64 initial_value,
-                                     s64 initial_n,
-                                     s64 n,
-                                     unsigned weight,
-                                     s64 *data,
-                                     s64 *mean,
-                                     s64 *stddev,
-                                     s64 *weighted_mean,
-                                     s64 *weighted_stddev)
-{
-       struct mean_and_variance mv = {};
-       struct mean_and_variance_weighted vw = { };
-
-       for (unsigned i = 0; i < initial_n; i++) {
-               mean_and_variance_update(&mv, initial_value);
-               mean_and_variance_weighted_update(&vw, initial_value, false, weight);
-
-               KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),           initial_value);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),         0);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight),  initial_value);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
-       }
-
-       for (unsigned i = 0; i < n; i++) {
-               mean_and_variance_update(&mv, data[i]);
-               mean_and_variance_weighted_update(&vw, data[i], true, weight);
-
-               KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),           mean[i]);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),         stddev[i]);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight),  weighted_mean[i]);
-               KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
-       }
-
-       KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
-}
-
-/* Test behaviour with a single outlier, then back to steady state: */
-static void mean_and_variance_test_1(struct kunit *test)
-{
-       s64 d[]                 = { 100, 10, 10, 10, 10, 10, 10 };
-       s64 mean[]              = {  22, 21, 20, 19, 18, 17, 16 };
-       s64 stddev[]            = {  32, 29, 28, 27, 26, 25, 24 };
-       s64 weighted_mean[]     = {  32, 27, 22, 19, 17, 15, 14 };
-       s64 weighted_stddev[]   = {  38, 35, 31, 27, 24, 21, 18 };
-
-       do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
-                       d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_2(struct kunit *test)
-{
-       s64 d[]                 = { 100, 100, 100, 100, 100 };
-       s64 mean[]              = {  22,  32,  40,  46,  50 };
-       s64 stddev[]            = {  32,  39,  42,  44,  45 };
-       s64 weighted_mean[]     = {  32,  49,  61,  71,  78 };
-       s64 weighted_stddev[]   = {  38,  44,  44,  41,  38 };
-
-       do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
-                       d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-static void mean_and_variance_fast_divpow2(struct kunit *test)
-{
-       s64 i;
-       u8 d;
-
-       for (i = 0; i < 100; i++) {
-               d = 0;
-               KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
-               KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
-               for (d = 1; d < 32; d++) {
-                       KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
-                                           div_u64(i, 1 << d), "%lld %u", i, d);
-                       KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
-                                           div_u64(i, 1 << d), "%lld %u", -i, d);
-               }
-       }
-}
-
-static void mean_and_variance_u128_basic_test(struct kunit *test)
-{
-       u128_u a  = u64s_to_u128(0, U64_MAX);
-       u128_u a1 = u64s_to_u128(0, 1);
-       u128_u b  = u64s_to_u128(1, 0);
-       u128_u c  = u64s_to_u128(0, 1LLU << 63);
-       u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
-
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
-
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
-
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
-
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
-
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
-
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
-
-       KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
-       KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
-}
-
-static struct kunit_case mean_and_variance_test_cases[] = {
-       KUNIT_CASE(mean_and_variance_fast_divpow2),
-       KUNIT_CASE(mean_and_variance_u128_basic_test),
-       KUNIT_CASE(mean_and_variance_basic_test),
-       KUNIT_CASE(mean_and_variance_weighted_test),
-       KUNIT_CASE(mean_and_variance_weighted_advanced_test),
-       KUNIT_CASE(mean_and_variance_test_1),
-       KUNIT_CASE(mean_and_variance_test_2),
-       {}
-};
-
-static struct kunit_suite mean_and_variance_test_suite = {
-       .name           = "mean and variance tests",
-       .test_cases     = mean_and_variance_test_cases
-};
-
-kunit_test_suite(mean_and_variance_test_suite);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
-MODULE_LICENSE("GPL");
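
The weighted tests above pin down the fixed-point EWMA behaviour exactly. The same numbers can be reproduced with a self-contained sketch of the update rule, with state kept left-shifted by the weight as in mean_and_variance_weighted_update():

	#include <stdint.h>
	#include <stdio.h>

	static int64_t fast_divpow2(int64_t n, uint8_t d)
	{
		return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
	}

	struct mv_weighted {
		int64_t  mean;
		uint64_t variance;
	};

	/* same fixed-point EWMA update as the kernel code */
	static void mv_weighted_update(struct mv_weighted *s, int64_t x,
				       int initted, uint8_t w)
	{
		int64_t x_w	= x << w;
		int64_t diff_w	= x_w - s->mean;
		int64_t u_w1	= s->mean + fast_divpow2(diff_w, w);

		if (!initted) {
			s->mean		= x_w;
			s->variance	= 0;
		} else {
			s->mean		= u_w1;
			s->variance	= ((s->variance << w) - s->variance +
					   ((diff_w * (x_w - u_w1)) >> w)) >> w;
		}
	}

	int main(void)
	{
		struct mv_weighted s = { 0 };
		int64_t xs[] = { 10, 20, 30 };

		for (int i = 0; i < 3; i++) {
			mv_weighted_update(&s, xs[i], i > 0, 2);
			printf("mean=%lld variance=%llu\n",
			       (long long)fast_divpow2(s.mean, 2),
			       (unsigned long long)(s.variance >> 2));
		}
		/* prints 10/0, 12/18, 16/72 -- the values asserted by
		 * mean_and_variance_weighted_test() above */
		return 0;
	}
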
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
deleted file mode 100644 (file)
index f296cce..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for moving data off a device.
- */
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "ec.h"
-#include "errcode.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "migrate.h"
-#include "move.h"
-#include "progress.h"
-#include "replicas.h"
-#include "super-io.h"
-
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
-                        unsigned dev_idx, unsigned flags, bool metadata)
-{
-       unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
-       unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
-       unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
-       unsigned nr_good;
-
-       bch2_bkey_drop_device(k, dev_idx);
-
-       nr_good = bch2_bkey_durability(c, k.s_c);
-       if ((!nr_good && !(flags & lost)) ||
-           (nr_good < replicas && !(flags & degraded)))
-               return bch_err_throw(c, remove_would_lose_data);
-
-       return 0;
-}
-
-static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
-                          struct btree *b, unsigned dev_idx, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_buf k;
-
-       bch2_bkey_buf_init(&k);
-       bch2_bkey_buf_copy(&k, c, &b->key);
-
-       int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
-               bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
-
-       bch_err_fn(c, ret);
-       bch2_bkey_buf_exit(&k, c);
-       return ret;
-}
-
-static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
-                                    struct btree_iter *iter,
-                                    struct bkey_s_c k,
-                                    unsigned dev_idx,
-                                    unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *n;
-       int ret;
-
-       if (!bch2_bkey_has_device_c(k, dev_idx))
-               return 0;
-
-       n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
-       ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
-       if (ret)
-               return ret;
-
-       /*
-        * If the new extent no longer has any pointers, bch2_extent_normalize()
-        * will do the appropriate thing with it (turning it into a
-        * KEY_TYPE_error key, or just a discard if it was a cached extent)
-        */
-       bch2_extent_normalize(c, bkey_i_to_s(n));
-
-       /*
-        * Since we're not inserting through an extent iterator
-        * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
-        * we aren't using the extent overwrite path to delete, we're
-        * just using the normal key deletion path:
-        */
-       if (bkey_deleted(&n->k))
-               n->k.size = 0;
-       return 0;
-}
-
-static int bch2_dev_btree_drop_key(struct btree_trans *trans,
-                                  struct bkey_s_c_backpointer bp,
-                                  unsigned dev_idx,
-                                  struct bkey_buf *last_flushed,
-                                  unsigned flags)
-{
-       struct btree_iter iter;
-       struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
-       int ret = PTR_ERR_OR_ZERO(b);
-       if (ret)
-               return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
-
-       ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_dev_usrdata_drop(struct bch_fs *c,
-                                struct progress_indicator_state *progress,
-                                unsigned dev_idx, unsigned flags)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       enum btree_id id;
-       int ret = 0;
-
-       for (id = 0; id < BTREE_ID_NR; id++) {
-               if (!btree_type_has_ptrs(id))
-                       continue;
-
-               ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-                       bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
-                       bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-               }));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_put(trans);
-
-       return ret;
-}
-
-static int bch2_dev_metadata_drop(struct bch_fs *c,
-                                 struct progress_indicator_state *progress,
-                                 unsigned dev_idx, unsigned flags)
-{
-       struct btree_trans *trans;
-       struct btree_iter iter;
-       struct closure cl;
-       struct btree *b;
-       struct bkey_buf k;
-       unsigned id;
-       int ret;
-
-       /* don't handle this yet: */
-       if (flags & BCH_FORCE_IF_METADATA_LOST)
-               return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
-
-       trans = bch2_trans_get(c);
-       bch2_bkey_buf_init(&k);
-       closure_init_stack(&cl);
-
-       for (id = 0; id < BTREE_ID_NR; id++) {
-               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
-                                         BTREE_ITER_prefetch);
-retry:
-               ret = 0;
-               while (bch2_trans_begin(trans),
-                      (b = bch2_btree_iter_peek_node(trans, &iter)) &&
-                      !(ret = PTR_ERR_OR_ZERO(b))) {
-                       bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
-
-                       if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
-                               goto next;
-
-                       ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                               ret = 0;
-                               continue;
-                       }
-
-                       if (ret)
-                               break;
-next:
-                       bch2_btree_iter_next_node(trans, &iter);
-               }
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto retry;
-
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (ret)
-                       goto err;
-       }
-
-       bch2_btree_interior_updates_flush(c);
-       ret = 0;
-err:
-       bch2_bkey_buf_exit(&k, c);
-       bch2_trans_put(trans);
-
-       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
-       return ret;
-}
-
-static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
-                       struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
-                       unsigned flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
-                                                    last_flushed);
-       int ret = bkey_err(k);
-       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
-               return 0;
-       if (ret)
-               return ret;
-
-       if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
-               goto out;
-
-       /*
-        * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
-        * properly
-        */
-
-       if (bkey_is_btree_ptr(k.k))
-               ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
-       else if (k.k->type == KEY_TYPE_stripe)
-               ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
-       else
-               ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-               for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
-                               POS(dev_idx, 0),
-                               POS(dev_idx, U64_MAX), 0, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-                       if (k.k->type != KEY_TYPE_backpointer)
-                               continue;
-
-                       data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
-                                    &last_flushed, flags);
-
-       }));
-
-       bch2_bkey_buf_exit(&last_flushed, trans->c);
-       bch2_trans_put(trans);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
-       struct progress_indicator_state progress;
-       bch2_progress_init(&progress, c,
-                          BIT_ULL(BTREE_ID_extents)|
-                          BIT_ULL(BTREE_ID_reflink));
-
-       return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
-               bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
-}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
deleted file mode 100644 (file)
index 3001814..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MIGRATE_H
-#define _BCACHEFS_MIGRATE_H
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
-int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_MIGRATE_H */
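
drop_dev_ptrs() in migrate.c above encodes the safety policy for device removal: after the device's pointer is dropped, remaining durability must stay non-zero unless the "lost" flag was forced, and must still meet the configured replica count unless the "degraded" flag was forced. A minimal sketch of just that decision; the flag values are hypothetical stand-ins, not the real BCH_FORCE_IF_* bits:

	#include <stdbool.h>
	#include <stdio.h>

	#define FORCE_IF_LOST     (1U << 0)	/* stand-in flag values */
	#define FORCE_IF_DEGRADED (1U << 1)

	/* mirrors the check in drop_dev_ptrs(): refuse removal unless the
	 * user explicitly allowed data loss / degraded redundancy */
	static bool removal_allowed(unsigned durability_after,
				    unsigned replicas_wanted, unsigned flags)
	{
		if (!durability_after && !(flags & FORCE_IF_LOST))
			return false;
		if (durability_after < replicas_wanted && !(flags & FORCE_IF_DEGRADED))
			return false;
		return true;
	}

	int main(void)
	{
		printf("%d\n", removal_allowed(1, 2, 0));			/* 0: would degrade */
		printf("%d\n", removal_allowed(1, 2, FORCE_IF_DEGRADED));	/* 1 */
		printf("%d\n", removal_allowed(0, 2, FORCE_IF_DEGRADED));	/* 0: would lose data */
		return 0;
	}
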
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
deleted file mode 100644 (file)
index eec591e..0000000
+++ /dev/null
@@ -1,1494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/kthread.h>
-
-const char * const bch2_data_ops_strs[] = {
-#define x(t, n, ...) [n] = #t,
-       BCH_DATA_OPS()
-#undef x
-       NULL
-};
-
-struct evacuate_bucket_arg {
-       struct bpos             bucket;
-       int                     gen;
-       struct data_update_opts data_opts;
-};
-
-static bool evacuate_bucket_pred(struct bch_fs *, void *,
-                                enum btree_id, struct bkey_s_c,
-                                struct bch_io_opts *,
-                                struct data_update_opts *);
-
-static noinline void
-trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
-              struct bch_io_opts *io_opts,
-              struct data_update_opts *data_opts)
-{
-       struct printbuf buf = PRINTBUF;
-
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_newline(&buf);
-       bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
-       trace_io_move(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct printbuf buf = PRINTBUF;
-
-       bch2_bkey_val_to_text(&buf, c, k);
-       trace_io_move_read(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
-                   struct bch_io_opts *io_opts,
-                   struct data_update_opts *data_opts,
-                   move_pred_fn pred, void *_arg, bool p)
-{
-       struct printbuf buf = PRINTBUF;
-
-       prt_printf(&buf, "%ps: %u", pred, p);
-
-       if (pred == evacuate_bucket_pred) {
-               struct evacuate_bucket_arg *arg = _arg;
-               prt_printf(&buf, " gen=%u", arg->gen);
-       }
-
-       prt_newline(&buf);
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_newline(&buf);
-       bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
-       trace_io_move_pred(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
-{
-       struct printbuf buf = PRINTBUF;
-
-       prt_printf(&buf, "bucket: ");
-       bch2_bpos_to_text(&buf, bucket);
-       prt_printf(&buf, " gen: %i\n", gen);
-
-       trace_io_move_evacuate_bucket(c, buf.buf);
-       printbuf_exit(&buf);
-}
-
-struct moving_io {
-       struct list_head                read_list;
-       struct list_head                io_list;
-       struct move_bucket              *b;
-       struct closure                  cl;
-       bool                            read_completed;
-
-       unsigned                        read_sectors;
-       unsigned                        write_sectors;
-
-       struct data_update              write;
-};
-
-static void move_free(struct moving_io *io)
-{
-       struct moving_context *ctxt = io->write.ctxt;
-
-       if (io->b)
-               atomic_dec(&io->b->count);
-
-       mutex_lock(&ctxt->lock);
-       list_del(&io->io_list);
-       wake_up(&ctxt->wait);
-       mutex_unlock(&ctxt->lock);
-
-       if (!io->write.data_opts.scrub) {
-               bch2_data_update_exit(&io->write);
-       } else {
-               bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
-               kfree(io->write.bvecs);
-       }
-       kfree(io);
-}
-
-static void move_write_done(struct bch_write_op *op)
-{
-       struct moving_io *io = container_of(op, struct moving_io, write.op);
-       struct bch_fs *c = op->c;
-       struct moving_context *ctxt = io->write.ctxt;
-
-       if (op->error) {
-               if (trace_io_move_write_fail_enabled()) {
-                       struct printbuf buf = PRINTBUF;
-
-                       bch2_write_op_to_text(&buf, op);
-                       trace_io_move_write_fail(c, buf.buf);
-                       printbuf_exit(&buf);
-               }
-               this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
-
-               ctxt->write_error = true;
-       }
-
-       atomic_sub(io->write_sectors, &ctxt->write_sectors);
-       atomic_dec(&ctxt->write_ios);
-       move_free(io);
-       closure_put(&ctxt->cl);
-}
-
-static void move_write(struct moving_io *io)
-{
-       struct bch_fs *c = io->write.op.c;
-       struct moving_context *ctxt = io->write.ctxt;
-       struct bch_read_bio *rbio = &io->write.rbio;
-
-       if (ctxt->stats) {
-               if (rbio->bio.bi_status)
-                       atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
-                                    &ctxt->stats->sectors_error_uncorrected);
-               else if (rbio->saw_error)
-                       atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
-                                    &ctxt->stats->sectors_error_corrected);
-       }
-
-       /*
-        * If the extent has been bitrotted, we're going to have to give it a
-        * new checksum in order to move it - but the poison bit will ensure
-        * that userspace still gets the appropriate error.
-        */
-       if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
-                    (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
-               struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-               struct nonce nonce = extent_nonce(rbio->version, crc);
-
-               rbio->pick.crc.csum     = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
-                                                           nonce, &rbio->bio);
-               rbio->ret               = 0;
-       }
-
-       if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
-               move_free(io);
-               return;
-       }
-
-       if (trace_io_move_write_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
-               trace_io_move_write(c, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       closure_get(&io->write.ctxt->cl);
-       atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
-       atomic_inc(&io->write.ctxt->write_ios);
-
-       bch2_data_update_read_done(&io->write);
-}
-
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
-{
-       struct moving_io *io =
-               list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
-
-       return io && io->read_completed ? io : NULL;
-}
-
-static void move_read_endio(struct bio *bio)
-{
-       struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
-       struct moving_context *ctxt = io->write.ctxt;
-
-       atomic_sub(io->read_sectors, &ctxt->read_sectors);
-       atomic_dec(&ctxt->read_ios);
-       io->read_completed = true;
-
-       wake_up(&ctxt->wait);
-       closure_put(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
-{
-       struct moving_io *io;
-
-       while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
-               bch2_trans_unlock_long(ctxt->trans);
-               list_del(&io->read_list);
-               move_write(io);
-       }
-}
-
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
-       unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
-
-       move_ctxt_wait_event(ctxt,
-               !atomic_read(&ctxt->write_sectors) ||
-               atomic_read(&ctxt->write_sectors) != sectors_pending);
-}
-
-void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
-{
-       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
-       bch2_trans_unlock_long(ctxt->trans);
-       closure_sync(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_exit(struct moving_context *ctxt)
-{
-       struct bch_fs *c = ctxt->trans->c;
-
-       bch2_moving_ctxt_flush_all(ctxt);
-
-       EBUG_ON(atomic_read(&ctxt->write_sectors));
-       EBUG_ON(atomic_read(&ctxt->write_ios));
-       EBUG_ON(atomic_read(&ctxt->read_sectors));
-       EBUG_ON(atomic_read(&ctxt->read_ios));
-
-       mutex_lock(&c->moving_context_lock);
-       list_del(&ctxt->list);
-       mutex_unlock(&c->moving_context_lock);
-
-       /*
-        * Generally, releasing a transaction within a transaction restart means
-        * an unhandled transaction restart: but this can happen legitimately
-        * within the move code, e.g. when bch2_move_ratelimit() tells us to
-        * exit before we've retried
-        */
-       bch2_trans_begin(ctxt->trans);
-       bch2_trans_put(ctxt->trans);
-       memset(ctxt, 0, sizeof(*ctxt));
-}
-
-void bch2_moving_ctxt_init(struct moving_context *ctxt,
-                          struct bch_fs *c,
-                          struct bch_ratelimit *rate,
-                          struct bch_move_stats *stats,
-                          struct write_point_specifier wp,
-                          bool wait_on_copygc)
-{
-       memset(ctxt, 0, sizeof(*ctxt));
-
-       ctxt->trans     = bch2_trans_get(c);
-       ctxt->fn        = (void *) _RET_IP_;
-       ctxt->rate      = rate;
-       ctxt->stats     = stats;
-       ctxt->wp        = wp;
-       ctxt->wait_on_copygc = wait_on_copygc;
-
-       closure_init_stack(&ctxt->cl);
-
-       mutex_init(&ctxt->lock);
-       INIT_LIST_HEAD(&ctxt->reads);
-       INIT_LIST_HEAD(&ctxt->ios);
-       init_waitqueue_head(&ctxt->wait);
-
-       mutex_lock(&c->moving_context_lock);
-       list_add(&ctxt->list, &c->moving_context_list);
-       mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
-{
-       trace_move_data(c, stats);
-}
-
-void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
-{
-       memset(stats, 0, sizeof(*stats));
-       stats->data_type = BCH_DATA_user;
-       scnprintf(stats->name, sizeof(stats->name), "%s", name);
-}
-
-int bch2_move_extent(struct moving_context *ctxt,
-                    struct move_bucket *bucket_in_flight,
-                    struct btree_iter *iter,
-                    struct bkey_s_c k,
-                    struct bch_io_opts io_opts,
-                    struct data_update_opts data_opts)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       int ret = -ENOMEM;
-
-       if (trace_io_move_enabled())
-               trace_io_move2(c, k, &io_opts, &data_opts);
-       this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
-
-       if (ctxt->stats)
-               ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
-
-       bch2_data_update_opts_normalize(k, &data_opts);
-
-       if (!data_opts.rewrite_ptrs &&
-           !data_opts.extra_replicas &&
-           !data_opts.scrub) {
-               if (data_opts.kill_ptrs)
-                       return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
-               return 0;
-       }
-
-       struct moving_io *io = allocate_dropping_locks(trans, ret,
-                               kzalloc(sizeof(struct moving_io), _gfp));
-       if (!io)
-               goto err;
-
-       if (ret)
-               goto err_free;
-
-       INIT_LIST_HEAD(&io->io_list);
-       io->write.ctxt          = ctxt;
-       io->read_sectors        = k.k->size;
-       io->write_sectors       = k.k->size;
-
-       if (!data_opts.scrub) {
-               ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
-                                           &io_opts, data_opts, iter->btree_id, k);
-               if (ret)
-                       goto err_free;
-
-               io->write.op.end_io     = move_write_done;
-       } else {
-               bch2_bkey_buf_init(&io->write.k);
-               bch2_bkey_buf_reassemble(&io->write.k, c, k);
-
-               io->write.op.c          = c;
-               io->write.data_opts     = data_opts;
-
-               bch2_trans_unlock(trans);
-
-               ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
-               if (ret)
-                       goto err_free;
-       }
-
-       io->write.rbio.bio.bi_end_io = move_read_endio;
-       io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
-       if (ctxt->rate)
-               bch2_ratelimit_increment(ctxt->rate, k.k->size);
-
-       if (ctxt->stats) {
-               atomic64_inc(&ctxt->stats->keys_moved);
-               atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
-       }
-
-       if (bucket_in_flight) {
-               io->b = bucket_in_flight;
-               atomic_inc(&io->b->count);
-       }
-
-       if (trace_io_move_read_enabled())
-               trace_io_move_read2(c, k);
-
-       mutex_lock(&ctxt->lock);
-       atomic_add(io->read_sectors, &ctxt->read_sectors);
-       atomic_inc(&ctxt->read_ios);
-
-       list_add_tail(&io->read_list, &ctxt->reads);
-       list_add_tail(&io->io_list, &ctxt->ios);
-       mutex_unlock(&ctxt->lock);
-
-       /*
-        * dropped by move_read_endio() - guards against use after free of
-        * ctxt when doing wakeup
-        */
-       closure_get(&ctxt->cl);
-       __bch2_read_extent(trans, &io->write.rbio,
-                          io->write.rbio.bio.bi_iter,
-                          bkey_start_pos(k.k),
-                          iter->btree_id, k, 0,
-                          NULL,
-                          BCH_READ_last_fragment,
-                          data_opts.scrub ?  data_opts.read_dev : -1);
-       return 0;
-err_free:
-       kfree(io);
-err:
-       if (bch2_err_matches(ret, EROFS) ||
-           bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               return ret;
-
-       count_event(c, io_move_start_fail);
-
-       if (trace_io_move_start_fail_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, k);
-               prt_str(&buf, ": ");
-               prt_str(&buf, bch2_err_str(ret));
-               trace_io_move_start_fail(c, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       if (bch2_err_matches(ret, BCH_ERR_data_update_done))
-               return 0;
-       return ret;
-}
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
-                         struct per_snapshot_io_opts *io_opts,
-                         struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
-                         struct btree_iter *extent_iter,
-                         struct bkey_s_c extent_k)
-{
-       struct bch_fs *c = trans->c;
-       u32 restart_count = trans->restart_count;
-       struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
-       int ret = 0;
-
-       if (extent_iter->min_depth)
-               return opts_ret;
-
-       if (extent_k.k->type == KEY_TYPE_reflink_v)
-               goto out;
-
-       if (io_opts->cur_inum != extent_pos.inode) {
-               io_opts->d.nr = 0;
-
-               ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
-                                        BTREE_ITER_all_snapshots, k, ({
-                       if (k.k->p.offset != extent_pos.inode)
-                               break;
-
-                       if (!bkey_is_inode(k.k))
-                               continue;
-
-                       struct bch_inode_unpacked inode;
-                       _ret3 = bch2_inode_unpack(k, &inode);
-                       if (_ret3)
-                               break;
-
-                       struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
-                       bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
-
-                       darray_push(&io_opts->d, e);
-               }));
-               io_opts->cur_inum = extent_pos.inode;
-       }
-
-       ret = ret ?: trans_was_restarted(trans, restart_count);
-       if (ret)
-               return ERR_PTR(ret);
-
-       if (extent_k.k->p.snapshot)
-               darray_for_each(io_opts->d, i)
-                       if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
-                               opts_ret = &i->io_opts;
-                               break;
-                       }
-out:
-       ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
-       if (ret)
-               return ERR_PTR(ret);
-       return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
-                             struct bch_io_opts *io_opts,
-                             struct btree_iter *extent_iter,
-                             struct bkey_s_c extent_k)
-{
-       struct bch_fs *c = trans->c;
-
-       *io_opts = bch2_opts_to_inode_opts(c->opts);
-
-       /* reflink btree? */
-       if (!extent_k.k->p.inode)
-               goto out;
-
-       struct btree_iter inode_iter;
-       struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
-                              SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
-                              BTREE_ITER_cached);
-       int ret = bkey_err(inode_k);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               return ret;
-
-       if (!ret && bkey_is_inode(inode_k.k)) {
-               struct bch_inode_unpacked inode;
-               bch2_inode_unpack(inode_k, &inode);
-               bch2_inode_opts_get(io_opts, c, &inode);
-       }
-       bch2_trans_iter_exit(trans, &inode_iter);
-       /* seem to be spinning here? */
-out:
-       return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
-int bch2_move_ratelimit(struct moving_context *ctxt)
-{
-       struct bch_fs *c = ctxt->trans->c;
-       bool is_kthread = current->flags & PF_KTHREAD;
-       u64 delay;
-
-       if (ctxt->wait_on_copygc && c->copygc_running) {
-               bch2_moving_ctxt_flush_all(ctxt);
-               wait_event_killable(c->copygc_running_wq,
-                                   !c->copygc_running ||
-                                   (is_kthread && kthread_should_stop()));
-       }
-
-       do {
-               delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
-
-               if (is_kthread && kthread_should_stop())
-                       return 1;
-
-               if (delay)
-                       move_ctxt_wait_event_timeout(ctxt,
-                                       freezing(current) ||
-                                       (is_kthread && kthread_should_stop()),
-                                       delay);
-
-               if (unlikely(freezing(current))) {
-                       bch2_moving_ctxt_flush_all(ctxt);
-                       try_to_freeze();
-               }
-       } while (delay);
-
-       /*
-        * XXX: these limits really ought to be per device, SSDs and hard drives
-        * will want different limits
-        */
-       move_ctxt_wait_event(ctxt,
-               atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
-               atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
-               atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
-               atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
-
-       return 0;
-}
-
-/*
- * Move requires non extents iterators, and there's also no need for it to
- * signal indirect_extent_missing_error:
- */
-static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           struct bkey_s_c_reflink_p p)
-{
-       if (unlikely(REFLINK_P_ERROR(p.v)))
-               return bkey_s_c_null;
-
-       struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
-
-       bch2_trans_iter_init(trans, iter,
-                            BTREE_ID_reflink, reflink_pos,
-                            BTREE_ITER_not_extents);
-
-       struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
-       if (!k.k || bkey_err(k)) {
-               bch2_trans_iter_exit(trans, iter);
-               return k;
-       }
-
-       if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
-               bch2_trans_iter_exit(trans, iter);
-               return bkey_s_c_null;
-       }
-
-       return k;
-}
-
-int bch2_move_data_btree(struct moving_context *ctxt,
-                        struct bpos start,
-                        struct bpos end,
-                        move_pred_fn pred, void *arg,
-                        enum btree_id btree_id, unsigned level)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       struct per_snapshot_io_opts snapshot_io_opts;
-       struct bch_io_opts *io_opts;
-       struct bkey_buf sk;
-       struct btree_iter iter, reflink_iter = {};
-       struct bkey_s_c k;
-       struct data_update_opts data_opts;
-       /*
-        * If we're moving a single file, also process reflinked data it points
-        * to (this includes propagating changed io_opts from the inode to the
-        * extent):
-        */
-       bool walk_indirect = start.inode == end.inode;
-       int ret = 0, ret2;
-
-       per_snapshot_io_opts_init(&snapshot_io_opts, c);
-       bch2_bkey_buf_init(&sk);
-
-       if (ctxt->stats) {
-               ctxt->stats->data_type  = BCH_DATA_user;
-               ctxt->stats->pos        = BBPOS(btree_id, start);
-       }
-
-retry_root:
-       bch2_trans_begin(trans);
-
-       if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
-               bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
-                                         BTREE_ITER_prefetch|
-                                         BTREE_ITER_not_extents|
-                                         BTREE_ITER_all_snapshots);
-               struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-               ret = PTR_ERR_OR_ZERO(b);
-               if (ret)
-                       goto root_err;
-
-               if (b != btree_node_root(c, b)) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       goto retry_root;
-               }
-
-               k = bkey_i_to_s_c(&b->key);
-
-               io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
-                                               iter.pos, &iter, k);
-               ret = PTR_ERR_OR_ZERO(io_opts);
-               if (ret)
-                       goto root_err;
-
-               memset(&data_opts, 0, sizeof(data_opts));
-               if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
-                       goto out;
-
-               if (!data_opts.scrub)
-                       ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
-                                                         k.k->p, data_opts.target, 0);
-               else
-                       ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
-root_err:
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       goto retry_root;
-               }
-
-               goto out;
-       }
-
-       bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
-                                 BTREE_ITER_prefetch|
-                                 BTREE_ITER_not_extents|
-                                 BTREE_ITER_all_snapshots);
-
-       if (ctxt->rate)
-               bch2_ratelimit_reset(ctxt->rate);
-
-       while (!bch2_move_ratelimit(ctxt)) {
-               struct btree_iter *extent_iter = &iter;
-
-               bch2_trans_begin(trans);
-
-               k = bch2_btree_iter_peek(trans, &iter);
-               if (!k.k)
-                       break;
-
-               ret = bkey_err(k);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               if (bkey_gt(bkey_start_pos(k.k), end))
-                       break;
-
-               if (ctxt->stats)
-                       ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
-               if (walk_indirect &&
-                   k.k->type == KEY_TYPE_reflink_p &&
-                   REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
-                       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
-                       bch2_trans_iter_exit(trans, &reflink_iter);
-                       k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
-                       ret = bkey_err(k);
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                               continue;
-                       if (ret)
-                               break;
-
-                       if (!k.k)
-                               goto next_nondata;
-
-                       /*
-                        * XXX: reflink pointers may point to multiple indirect
-                        * extents, so don't advance past the entire reflink
-                        * pointer - need to fixup iter->k
-                        */
-                       extent_iter = &reflink_iter;
-               }
-
-               if (!bkey_extent_is_direct_data(k.k))
-                       goto next_nondata;
-
-               io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
-                                               iter.pos, extent_iter, k);
-               ret = PTR_ERR_OR_ZERO(io_opts);
-               if (ret)
-                       continue;
-
-               memset(&data_opts, 0, sizeof(data_opts));
-               if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
-                       goto next;
-
-               /*
-                * The iterator gets unlocked by __bch2_read_extent - need to
-                * save a copy of @k elsewhere:
-                */
-               bch2_bkey_buf_reassemble(&sk, c, k);
-               k = bkey_i_to_s_c(sk.k);
-
-               if (!level)
-                       ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
-               else if (!data_opts.scrub)
-                       ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
-                                                         k.k->p, data_opts.target, 0);
-               else
-                       ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
-               if (ret2) {
-                       if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
-                               continue;
-
-                       if (bch2_err_matches(ret2, ENOMEM)) {
-                               /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt);
-                               continue;
-                       }
-
-                       /* XXX signal failure */
-                       goto next;
-               }
-next:
-               if (ctxt->stats)
-                       atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-next_nondata:
-               if (!bch2_btree_iter_advance(trans, &iter))
-                       break;
-       }
-out:
-       bch2_trans_iter_exit(trans, &reflink_iter);
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_bkey_buf_exit(&sk, c);
-       per_snapshot_io_opts_exit(&snapshot_io_opts);
-
-       return ret;
-}
-
-int __bch2_move_data(struct moving_context *ctxt,
-                    struct bbpos start,
-                    struct bbpos end,
-                    move_pred_fn pred, void *arg)
-{
-       struct bch_fs *c = ctxt->trans->c;
-       enum btree_id id;
-       int ret = 0;
-
-       for (id = start.btree;
-            id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
-            id++) {
-               ctxt->stats->pos = BBPOS(id, POS_MIN);
-
-               if (!btree_type_has_ptrs(id) ||
-                   !bch2_btree_id_root(c, id)->b)
-                       continue;
-
-               ret = bch2_move_data_btree(ctxt,
-                                      id == start.btree ? start.pos : POS_MIN,
-                                      id == end.btree   ? end.pos   : POS_MAX,
-                                      pred, arg, id, 0);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
-                  struct bbpos start,
-                  struct bbpos end,
-                  struct bch_ratelimit *rate,
-                  struct bch_move_stats *stats,
-                  struct write_point_specifier wp,
-                  bool wait_on_copygc,
-                  move_pred_fn pred, void *arg)
-{
-       struct moving_context ctxt;
-
-       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
-       bch2_moving_ctxt_exit(&ctxt);
-
-       return ret;
-}
-
-static int __bch2_move_data_phys(struct moving_context *ctxt,
-                       struct move_bucket *bucket_in_flight,
-                       unsigned dev,
-                       u64 bucket_start,
-                       u64 bucket_end,
-                       unsigned data_types,
-                       bool copygc,
-                       move_pred_fn pred, void *arg)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       bool is_kthread = current->flags & PF_KTHREAD;
-       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_iter iter = {}, bp_iter = {};
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       struct bkey_buf last_flushed;
-       u64 check_mismatch_done = bucket_start;
-       int ret = 0;
-
-       struct bch_dev *ca = bch2_dev_tryget(c, dev);
-       if (!ca)
-               return 0;
-
-       bucket_end = min(bucket_end, ca->mi.nbuckets);
-
-       struct bpos bp_start    = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
-       struct bpos bp_end      = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
-
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-       bch2_bkey_buf_init(&sk);
-
-       /*
-        * We're not running in a context that handles transaction restarts:
-        */
-       bch2_trans_begin(trans);
-
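-       /*
-        * Walk the backpointers btree across the bucket range, resolving each
-        * backpointer to the extent or btree node it points at:
-        */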
-       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
-
-       ret = bch2_btree_write_buffer_tryflush(trans);
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_msg(c, ret, "flushing btree write buffer");
-       if (ret)
-               goto err;
-
-       while (!(ret = bch2_move_ratelimit(ctxt))) {
-               if (is_kthread && kthread_should_stop())
-                       break;
-
-               bch2_trans_begin(trans);
-
-               k = bch2_btree_iter_peek(trans, &bp_iter);
-               ret = bkey_err(k);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       goto err;
-
-               if (!k.k || bkey_gt(k.k->p, bp_end))
-                       break;
-
-               if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
-                       while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
-                               bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
-                                                                      copygc, &last_flushed);
-                       }
-                       continue;
-               }
-
-               if (k.k->type != KEY_TYPE_backpointer)
-                       goto next;
-
-               struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
-               if (ctxt->stats)
-                       ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
-               if (!(data_types & BIT(bp.v->data_type)))
-                       goto next;
-
-               if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
-                       goto next;
-
-               k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
-               ret = bkey_err(k);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       goto err;
-               if (!k.k)
-                       goto next;
-
-               if (!bp.v->level) {
-                       ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
-                       if (ret) {
-                               bch2_trans_iter_exit(trans, &iter);
-                               continue;
-                       }
-               }
-
-               struct data_update_opts data_opts = {};
-               bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
-
-               if (trace_io_move_pred_enabled())
-                       trace_io_move_pred2(c, k, &io_opts, &data_opts,
-                                           pred, arg, p);
-
-               if (!p) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       goto next;
-               }
-
-               if (data_opts.scrub &&
-                   !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       ret = bch_err_throw(c, device_offline);
-                       break;
-               }
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-               k = bkey_i_to_s_c(sk.k);
-
-               /* move_extent will drop locks */
-               unsigned sectors = bp.v->bucket_len;
-
-               if (!bp.v->level)
-                       ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
-               else if (!data_opts.scrub)
-                       ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
-                                                         k.k->p, data_opts.target, 0);
-               else
-                       ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
-
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret == -ENOMEM) {
-                       /* memory allocation failure, wait for some IO to finish */
-                       bch2_move_ctxt_wait_for_io(ctxt);
-                       continue;
-               }
-               if (ret)
-                       goto err;
-
-               if (ctxt->stats)
-                       atomic64_add(sectors, &ctxt->stats->sectors_seen);
-next:
-               bch2_btree_iter_advance(trans, &bp_iter);
-       }
-
-       while (check_mismatch_done < bucket_end)
-               bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
-                                                      copygc, &last_flushed);
-err:
-       bch2_trans_iter_exit(trans, &bp_iter);
-       bch2_bkey_buf_exit(&sk, c);
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch2_dev_put(ca);
-       return ret;
-}
-
-int bch2_move_data_phys(struct bch_fs *c,
-                       unsigned dev,
-                       u64 start,
-                       u64 end,
-                       unsigned data_types,
-                       struct bch_ratelimit *rate,
-                       struct bch_move_stats *stats,
-                       struct write_point_specifier wp,
-                       bool wait_on_copygc,
-                       move_pred_fn pred, void *arg)
-{
-       struct moving_context ctxt;
-
-       bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
-
-       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       if (ctxt.stats) {
-               ctxt.stats->phys = true;
-               ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
-       }
-
-       int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
-                                       data_types, false, pred, arg);
-       bch2_moving_ctxt_exit(&ctxt);
-
-       return ret;
-}
-
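-/*
- * Mark for rewrite every non-cached pointer that lives in the bucket being
- * evacuated, optionally filtering on generation number:
- */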
-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
-                                enum btree_id btree, struct bkey_s_c k,
-                                struct bch_io_opts *io_opts,
-                                struct data_update_opts *data_opts)
-{
-       struct evacuate_bucket_arg *arg = _arg;
-
-       *data_opts = arg->data_opts;
-
-       unsigned i = 0;
-       bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
-               if (ptr->dev == arg->bucket.inode &&
-                   (arg->gen < 0 || arg->gen == ptr->gen) &&
-                   !ptr->cached)
-                       data_opts->rewrite_ptrs |= BIT(i);
-               i++;
-       }
-
-       return data_opts->rewrite_ptrs != 0;
-}
-
-int bch2_evacuate_bucket(struct moving_context *ctxt,
-                        struct move_bucket *bucket_in_flight,
-                        struct bpos bucket, int gen,
-                        struct data_update_opts data_opts)
-{
-       struct bch_fs *c = ctxt->trans->c;
-       struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
-
-       count_event(c, io_move_evacuate_bucket);
-       if (trace_io_move_evacuate_bucket_enabled())
-               trace_io_move_evacuate_bucket2(c, bucket, gen);
-
-       return __bch2_move_data_phys(ctxt, bucket_in_flight,
-                                  bucket.inode,
-                                  bucket.offset,
-                                  bucket.offset + 1,
-                                  ~0,
-                                  true,
-                                  evacuate_bucket_pred, &arg);
-}
-
-typedef bool (*move_btree_pred)(struct bch_fs *, void *,
-                               struct btree *, struct bch_io_opts *,
-                               struct data_update_opts *);
-
-static int bch2_move_btree(struct bch_fs *c,
-                          struct bbpos start,
-                          struct bbpos end,
-                          move_btree_pred pred, void *arg,
-                          struct bch_move_stats *stats)
-{
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
-       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct moving_context ctxt;
-       struct btree_trans *trans;
-       struct btree_iter iter;
-       struct btree *b;
-       enum btree_id btree;
-       struct data_update_opts data_opts;
-       int ret = 0;
-
-       bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
-                             writepoint_ptr(&c->btree_write_point),
-                             true);
-       trans = ctxt.trans;
-
-       stats->data_type = BCH_DATA_btree;
-
-       for (btree = start.btree;
-            btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
-            btree++) {
-               stats->pos = BBPOS(btree, POS_MIN);
-
-               if (!bch2_btree_id_root(c, btree)->b)
-                       continue;
-
-               bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
-                                         BTREE_ITER_prefetch);
-retry:
-               ret = 0;
-               while (bch2_trans_begin(trans),
-                      (b = bch2_btree_iter_peek_node(trans, &iter)) &&
-                      !(ret = PTR_ERR_OR_ZERO(b))) {
-                       if (kthread && kthread_should_stop())
-                               break;
-
-                       if ((cmp_int(btree, end.btree) ?:
-                            bpos_cmp(b->key.k.p, end.pos)) > 0)
-                               break;
-
-                       stats->pos = BBPOS(iter.btree_id, iter.pos);
-
-                       if (!pred(c, arg, b, &io_opts, &data_opts))
-                               goto next;
-
-                       ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
-                       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                               continue;
-                       if (ret)
-                               break;
-next:
-                       bch2_btree_iter_next_node(trans, &iter);
-               }
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto retry;
-
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (kthread && kthread_should_stop())
-                       break;
-       }
-
-       bch_err_fn(c, ret);
-       bch2_moving_ctxt_exit(&ctxt);
-       bch2_btree_interior_updates_flush(c);
-
-       return ret;
-}
-
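-/*
- * Returns true if @k needs more replicas: pointers to lost or zero-durability
- * devices are marked for removal, and extra replicas are requested until the
- * durability target is met:
- */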
-static bool rereplicate_pred(struct bch_fs *c, void *arg,
-                            enum btree_id btree, struct bkey_s_c k,
-                            struct bch_io_opts *io_opts,
-                            struct data_update_opts *data_opts)
-{
-       unsigned nr_good = bch2_bkey_durability(c, k);
-       unsigned replicas = bkey_is_btree_ptr(k.k)
-               ? c->opts.metadata_replicas
-               : io_opts->data_replicas;
-
-       guard(rcu)();
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       unsigned i = 0;
-       bkey_for_each_ptr(ptrs, ptr) {
-               struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-               if (!ptr->cached &&
-                   (!ca || !ca->mi.durability))
-                       data_opts->kill_ptrs |= BIT(i);
-               i++;
-       }
-
-       if (!data_opts->kill_ptrs &&
-           (!nr_good || nr_good >= replicas))
-               return false;
-
-       data_opts->target               = 0;
-       data_opts->extra_replicas       = replicas - nr_good;
-       data_opts->btree_insert_flags   = 0;
-       return true;
-}
-
-static bool migrate_pred(struct bch_fs *c, void *arg,
-                        enum btree_id btree, struct bkey_s_c k,
-                        struct bch_io_opts *io_opts,
-                        struct data_update_opts *data_opts)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_ioctl_data *op = arg;
-       unsigned i = 0;
-
-       data_opts->rewrite_ptrs         = 0;
-       data_opts->target               = 0;
-       data_opts->extra_replicas       = 0;
-       data_opts->btree_insert_flags   = 0;
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               if (ptr->dev == op->migrate.dev)
-                       data_opts->rewrite_ptrs |= 1U << i;
-               i++;
-       }
-
-       return data_opts->rewrite_ptrs != 0;
-}
-
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
-                                  struct btree *b,
-                                  struct bch_io_opts *io_opts,
-                                  struct data_update_opts *data_opts)
-{
-       return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-/*
- * Ancient versions of bcachefs produced packed formats which could represent
- * keys that the in-memory format cannot represent; this checks for those
- * formats so we can get rid of them.
- */
-static bool bformat_needs_redo(struct bkey_format *f)
-{
-       for (unsigned i = 0; i < f->nr_fields; i++)
-               if (bch2_bkey_format_field_overflows(f, i))
-                       return true;
-
-       return false;
-}
-
-static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
-                                  struct btree *b,
-                                  struct bch_io_opts *io_opts,
-                                  struct data_update_opts *data_opts)
-{
-       if (b->version_ondisk != c->sb.version ||
-           btree_node_need_rewrite(b) ||
-           bformat_needs_redo(&b->format)) {
-               data_opts->target               = 0;
-               data_opts->extra_replicas       = 0;
-               data_opts->btree_insert_flags   = 0;
-               return true;
-       }
-
-       return false;
-}
-
-int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
-{
-       int ret;
-
-       ret = bch2_move_btree(c,
-                             BBPOS_MIN,
-                             BBPOS_MAX,
-                             rewrite_old_nodes_pred, c, stats);
-       if (!ret) {
-               mutex_lock(&c->sb_lock);
-               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
-               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
-               c->disk_sb.sb->version_min = c->disk_sb.sb->version;
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-       }
-
-       bch_err_fn(c, ret);
-       return ret;
-}
-
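-/*
- * The inverse of rereplicate: drop pointers, in decode order, while the
- * remaining durability still meets the replicas target:
- */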
-static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
-                            enum btree_id btree, struct bkey_s_c k,
-                            struct bch_io_opts *io_opts,
-                            struct data_update_opts *data_opts)
-{
-       unsigned durability = bch2_bkey_durability(c, k);
-       unsigned replicas = bkey_is_btree_ptr(k.k)
-               ? c->opts.metadata_replicas
-               : io_opts->data_replicas;
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned i = 0;
-
-       guard(rcu)();
-       bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
-               unsigned d = bch2_extent_ptr_durability(c, &p);
-
-               if (d && durability - d >= replicas) {
-                       data_opts->kill_ptrs |= BIT(i);
-                       durability -= d;
-               }
-
-               i++;
-       }
-
-       return data_opts->kill_ptrs != 0;
-}
-
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
-                                  struct btree *b,
-                                  struct bch_io_opts *io_opts,
-                                  struct data_update_opts *data_opts)
-{
-       return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
-                                       io_opts, data_opts);
-}
-
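-/*
- * Scrub pred: btree node pointers (btree_ptr_v2) are always scrubbed; extents
- * only if the copy on the target device carries a checksum to verify against:
- */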
-static bool scrub_pred(struct bch_fs *c, void *_arg,
-                      enum btree_id btree, struct bkey_s_c k,
-                      struct bch_io_opts *io_opts,
-                      struct data_update_opts *data_opts)
-{
-       struct bch_ioctl_data *arg = _arg;
-
-       if (k.k->type != KEY_TYPE_btree_ptr_v2) {
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (p.ptr.dev == arg->migrate.dev) {
-                               if (!p.crc.csum_type)
-                                       return false;
-                               break;
-                       }
-       }
-
-       data_opts->scrub        = true;
-       data_opts->read_dev     = arg->migrate.dev;
-       return true;
-}
-
-int bch2_data_job(struct bch_fs *c,
-                 struct bch_move_stats *stats,
-                 struct bch_ioctl_data op)
-{
-       struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
-       struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
-       int ret = 0;
-
-       if (op.op >= BCH_DATA_OP_NR)
-               return -EINVAL;
-
-       bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
-
-       switch (op.op) {
-       case BCH_DATA_OP_scrub:
-               /*
-                * prevent tests from spuriously failing, make sure we see all
-                * btree nodes that need to be repaired
-                */
-               bch2_btree_interior_updates_flush(c);
-
-               ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
-                                         op.scrub.data_types,
-                                         NULL,
-                                         stats,
-                                         writepoint_hashed((unsigned long) current),
-                                         false,
-                                         scrub_pred, &op) ?: ret;
-               break;
-
-       case BCH_DATA_OP_rereplicate:
-               stats->data_type = BCH_DATA_journal;
-               ret = bch2_journal_flush_device_pins(&c->journal, -1);
-               ret = bch2_move_btree(c, start, end,
-                                     rereplicate_btree_pred, c, stats) ?: ret;
-               ret = bch2_move_data(c, start, end,
-                                    NULL,
-                                    stats,
-                                    writepoint_hashed((unsigned long) current),
-                                    true,
-                                    rereplicate_pred, c) ?: ret;
-               ret = bch2_replicas_gc2(c) ?: ret;
-               break;
-       case BCH_DATA_OP_migrate:
-               if (op.migrate.dev >= c->sb.nr_devices)
-                       return -EINVAL;
-
-               stats->data_type = BCH_DATA_journal;
-               ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-               ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
-                                         ~0,
-                                         NULL,
-                                         stats,
-                                         writepoint_hashed((unsigned long) current),
-                                         true,
-                                         migrate_pred, &op) ?: ret;
-               bch2_btree_interior_updates_flush(c);
-               ret = bch2_replicas_gc2(c) ?: ret;
-               break;
-       case BCH_DATA_OP_rewrite_old_nodes:
-               ret = bch2_scan_old_btree_nodes(c, stats);
-               break;
-       case BCH_DATA_OP_drop_extra_replicas:
-               ret = bch2_move_btree(c, start, end,
-                               drop_extra_replicas_btree_pred, c, stats) ?: ret;
-               ret = bch2_move_data(c, start, end, NULL, stats,
-                               writepoint_hashed((unsigned long) current),
-                               true,
-                               drop_extra_replicas_pred, c) ?: ret;
-               ret = bch2_replicas_gc2(c) ?: ret;
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       bch2_move_stats_exit(stats, c);
-       return ret;
-}
-
-void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
-{
-       prt_printf(out, "%s: data type=", stats->name);
-       bch2_prt_data_type(out, stats->data_type);
-       prt_str(out, " pos=");
-       bch2_bbpos_to_text(out, stats->pos);
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "keys moved:\t%llu\n",  atomic64_read(&stats->keys_moved));
-       prt_printf(out, "keys raced:\t%llu\n",  atomic64_read(&stats->keys_raced));
-       prt_printf(out, "bytes seen:\t");
-       prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
-       prt_newline(out);
-
-       prt_printf(out, "bytes moved:\t");
-       prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
-       prt_newline(out);
-
-       prt_printf(out, "bytes raced:\t");
-       prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
-       prt_newline(out);
-
-       printbuf_indent_sub(out, 2);
-}
-
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 32);
-
-       bch2_move_stats_to_text(out, ctxt->stats);
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
-                  atomic_read(&ctxt->read_ios),
-                  c->opts.move_ios_in_flight,
-                  atomic_read(&ctxt->read_sectors),
-                  c->opts.move_bytes_in_flight >> 9);
-
-       prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
-                  atomic_read(&ctxt->write_ios),
-                  c->opts.move_ios_in_flight,
-                  atomic_read(&ctxt->write_sectors),
-                  c->opts.move_bytes_in_flight >> 9);
-
-       printbuf_indent_add(out, 2);
-
-       mutex_lock(&ctxt->lock);
-       struct moving_io *io;
-       list_for_each_entry(io, &ctxt->ios, io_list)
-               bch2_data_update_inflight_to_text(out, &io->write);
-       mutex_unlock(&ctxt->lock);
-
-       printbuf_indent_sub(out, 4);
-}
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct moving_context *ctxt;
-
-       mutex_lock(&c->moving_context_lock);
-       list_for_each_entry(ctxt, &c->moving_context_list, list)
-               bch2_moving_ctxt_to_text(out, c, ctxt);
-       mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_fs_move_init(struct bch_fs *c)
-{
-       INIT_LIST_HEAD(&c->moving_context_list);
-       mutex_init(&c->moving_context_lock);
-}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
deleted file mode 100644 (file)
index 86b8049..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_H
-#define _BCACHEFS_MOVE_H
-
-#include "bbpos.h"
-#include "bcachefs_ioctl.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "move_types.h"
-
-struct bch_read_bio;
-
-struct moving_context {
-       struct btree_trans      *trans;
-       struct list_head        list;
-       void                    *fn;
-
-       struct bch_ratelimit    *rate;
-       struct bch_move_stats   *stats;
-       struct write_point_specifier wp;
-       bool                    wait_on_copygc;
-       bool                    write_error;
-
-       /* For waiting on outstanding reads and writes: */
-       struct closure          cl;
-
-       struct mutex            lock;
-       struct list_head        reads;
-       struct list_head        ios;
-
-       /* in flight sectors: */
-       atomic_t                read_sectors;
-       atomic_t                write_sectors;
-       atomic_t                read_ios;
-       atomic_t                write_ios;
-
-       wait_queue_head_t       wait;
-};
-
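-/*
- * Wait for @_cond, issuing any pending move writes and dropping the
- * transaction's btree locks while we sleep; bails out after @_timeout:
- */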
-#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout)                   \
-({                                                                             \
-       int _ret = 0;                                                           \
-       while (true) {                                                          \
-               bool cond_finished = false;                                     \
-               bch2_moving_ctxt_do_pending_writes(_ctxt);                      \
-                                                                               \
-               if (_cond)                                                      \
-                       break;                                                  \
-               bch2_trans_unlock_long((_ctxt)->trans);                         \
-               _ret = __wait_event_timeout((_ctxt)->wait,                      \
-                            bch2_moving_ctxt_next_pending_write(_ctxt) ||      \
-                            (cond_finished = (_cond)), _timeout);              \
-               if (_ret || cond_finished)                                      \
-                       break;                                                  \
-       }                                                                       \
-       _ret;                                                                   \
-})
-
-#define move_ctxt_wait_event(_ctxt, _cond)                             \
-do {                                                                   \
-       bool cond_finished = false;                                     \
-       bch2_moving_ctxt_do_pending_writes(_ctxt);                      \
-                                                                       \
-       if (_cond)                                                      \
-               break;                                                  \
-       bch2_trans_unlock_long((_ctxt)->trans);                         \
-       __wait_event((_ctxt)->wait,                                     \
-                    bch2_moving_ctxt_next_pending_write(_ctxt) ||      \
-                    (cond_finished = (_cond)));                        \
-       if (cond_finished)                                              \
-               break;                                                  \
-} while (1)
-
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
-                            struct bch_io_opts *, struct data_update_opts *);
-
-extern const char * const bch2_data_ops_strs[];
-
-void bch2_moving_ctxt_exit(struct moving_context *);
-void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
-                          struct bch_ratelimit *, struct bch_move_stats *,
-                          struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
-void bch2_moving_ctxt_flush_all(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
-int bch2_move_ratelimit(struct moving_context *);
-
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
-       u32                     snapshot;
-       struct bch_io_opts      io_opts;
-};
-
-struct per_snapshot_io_opts {
-       u64                     cur_inum;
-       struct bch_io_opts      fs_io_opts;
-       DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
-       memset(io_opts, 0, sizeof(*io_opts));
-       io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
-       darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
-                             struct btree_iter *, struct bkey_s_c);
-
-int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-
-int bch2_move_extent(struct moving_context *,
-                    struct move_bucket *,
-                    struct btree_iter *,
-                    struct bkey_s_c,
-                    struct bch_io_opts,
-                    struct data_update_opts);
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
-                         struct per_snapshot_io_opts *, struct bpos,
-                         struct btree_iter *, struct bkey_s_c);
-
-int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
-                        move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
-                    struct bbpos,
-                    struct bbpos,
-                    move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
-                  struct bbpos start,
-                  struct bbpos end,
-                  struct bch_ratelimit *,
-                  struct bch_move_stats *,
-                  struct write_point_specifier,
-                  bool,
-                  move_pred_fn, void *);
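-/*
- * Illustrative usage (not from the original source) - rereplicate everything,
- * mirroring the BCH_DATA_OP_rereplicate case of bch2_data_job():
- *
- *     int ret = bch2_move_data(c, BBPOS_MIN, BBPOS_MAX, NULL, stats,
- *                              writepoint_hashed((unsigned long) current),
- *                              true, rereplicate_pred, c);
- */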
-
-int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
-                       struct bch_ratelimit *, struct bch_move_stats *,
-                       struct write_point_specifier, bool,
-                       move_pred_fn, void *);
-
-int bch2_evacuate_bucket(struct moving_context *,
-                          struct move_bucket *,
-                          struct bpos, int,
-                          struct data_update_opts);
-int bch2_data_job(struct bch_fs *,
-                 struct bch_move_stats *,
-                 struct bch_ioctl_data);
-
-void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
-void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, const char *);
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_move_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644 (file)
index c5c62cd..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_TYPES_H
-#define _BCACHEFS_MOVE_TYPES_H
-
-#include "bbpos_types.h"
-#include "bcachefs_ioctl.h"
-
-struct bch_move_stats {
-       char                    name[32];
-       bool                    phys;
-       enum bch_ioctl_data_event_ret   ret;
-
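-       /* progress: logical position (btree, pos) or physical (dev, offset) */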
-       union {
-       struct {
-               enum bch_data_type      data_type;
-               struct bbpos            pos;
-       };
-       struct {
-               unsigned                dev;
-               u64                     offset;
-       };
-       };
-
-       atomic64_t              keys_moved;
-       atomic64_t              keys_raced;
-       atomic64_t              sectors_seen;
-       atomic64_t              sectors_moved;
-       atomic64_t              sectors_raced;
-       atomic64_t              sectors_error_corrected;
-       atomic64_t              sectors_error_uncorrected;
-};
-
-struct move_bucket_key {
-       struct bpos             bucket;
-       unsigned                gen;
-};
-
-struct move_bucket {
-       struct move_bucket      *next;
-       struct rhash_head       hash;
-       struct move_bucket_key  k;
-       unsigned                sectors;
-       atomic_t                count;
-};
-
-#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644 (file)
index 5e6de91..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "errcode.h"
-#include "error.h"
-#include "lru.h"
-#include "move.h"
-#include "movinggc.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/sched/task.h>
-#include <linux/wait.h>
-
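-/*
- * Buckets currently being evacuated: kept on a list so completed buckets can
- * be reaped in order, and in a hash table so we don't evacuate the same
- * bucket twice:
- */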
-struct buckets_in_flight {
-       struct rhashtable       *table;
-       struct move_bucket      *first;
-       struct move_bucket      *last;
-       size_t                  nr;
-       size_t                  sectors;
-
-       DARRAY(struct move_bucket *) to_evacuate;
-};
-
-static const struct rhashtable_params bch_move_bucket_params = {
-       .head_offset            = offsetof(struct move_bucket, hash),
-       .key_offset             = offsetof(struct move_bucket, k),
-       .key_len                = sizeof(struct move_bucket_key),
-       .automatic_shrinking    = true,
-};
-
-static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
-{
-       if (!list->first)
-               list->first = b;
-       else
-               list->last->next = b;
-
-       list->last = b;
-       list->nr++;
-       list->sectors += b->sectors;
-}
-
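-/*
- * A bucket is movable if it isn't open for writes, its backpointers aren't
- * known to be inconsistent, its device is online and read-write, and its
- * fragmentation LRU time has been reached:
- */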
-static int bch2_bucket_is_movable(struct btree_trans *trans,
-                                 struct move_bucket *b, u64 time)
-{
-       struct bch_fs *c = trans->c;
-
-       if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
-               return 0;
-
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
-                                      b->k.bucket, BTREE_ITER_cached);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p);
-       if (!ca)
-               goto out;
-
-       if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
-               goto out;
-
-       if (ca->mi.state != BCH_MEMBER_STATE_rw ||
-           !bch2_dev_is_online(ca))
-               goto out;
-
-       struct bch_alloc_v4 _a;
-       const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-       b->k.gen        = a->gen;
-       b->sectors      = bch2_bucket_sectors_dirty(*a);
-       u64 lru_idx     = alloc_lru_idx_fragmentation(*a, ca);
-
-       ret = lru_idx && lru_idx <= time;
-out:
-       bch2_dev_put(ca);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static void move_bucket_free(struct buckets_in_flight *list,
-                            struct move_bucket *b)
-{
-       int ret = rhashtable_remove_fast(list->table, &b->hash,
-                                        bch_move_bucket_params);
-       BUG_ON(ret);
-       kfree(b);
-}
-
-static void move_buckets_wait(struct moving_context *ctxt,
-                             struct buckets_in_flight *list,
-                             bool flush)
-{
-       struct move_bucket *i;
-
-       while ((i = list->first)) {
-               if (flush)
-                       move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
-
-               if (atomic_read(&i->count))
-                       break;
-
-               list->first = i->next;
-               if (!list->first)
-                       list->last = NULL;
-
-               list->nr--;
-               list->sectors -= i->sectors;
-
-               move_bucket_free(list, i);
-       }
-
-       bch2_trans_unlock_long(ctxt->trans);
-}
-
-static bool bucket_in_flight(struct buckets_in_flight *list,
-                            struct move_bucket_key k)
-{
-       return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
-}
-
-static int bch2_copygc_get_buckets(struct moving_context *ctxt,
-                       struct buckets_in_flight *buckets_in_flight)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
-       size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
-       int ret;
-
-       move_buckets_wait(ctxt, buckets_in_flight, false);
-
-       ret = bch2_btree_write_buffer_tryflush(trans);
-       if (bch2_err_matches(ret, EROFS))
-               return ret;
-
-       if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
-               return ret;
-
-       ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
-                                 lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
-                                 lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
-                                 0, k, ({
-               struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-               int ret2 = 0;
-
-               saw++;
-
-               ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
-               if (ret2 < 0)
-                       goto err;
-
-               if (!ret2)
-                       not_movable++;
-               else if (bucket_in_flight(buckets_in_flight, b.k))
-                       in_flight++;
-               else {
-                       struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
-                       ret2 = b_i ? 0 : -ENOMEM;
-                       if (ret2)
-                               goto err;
-
-                       *b_i = b;
-
-                       ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
-                       if (ret2) {
-                               kfree(b_i);
-                               goto err;
-                       }
-
-                       ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
-                                                            bch_move_bucket_params);
-                       BUG_ON(ret2);
-
-                       sectors += b.sectors;
-               }
-
-               ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
-err:
-               ret2;
-       }));
-
-       pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
-                buckets_in_flight->nr, buckets_in_flight->sectors,
-                saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
-
-       return ret < 0 ? ret : 0;
-}
-
-noinline
-static int bch2_copygc(struct moving_context *ctxt,
-                      struct buckets_in_flight *buckets_in_flight,
-                      bool *did_work)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       struct data_update_opts data_opts = {
-               .btree_insert_flags = BCH_WATERMARK_copygc,
-       };
-       u64 sectors_seen        = atomic64_read(&ctxt->stats->sectors_seen);
-       u64 sectors_moved       = atomic64_read(&ctxt->stats->sectors_moved);
-       int ret = 0;
-
-       ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
-       if (ret)
-               goto err;
-
-       darray_for_each(buckets_in_flight->to_evacuate, i) {
-               if (kthread_should_stop() || freezing(current))
-                       break;
-
-               struct move_bucket *b = *i;
-               *i = NULL;
-
-               move_bucket_in_flight_add(buckets_in_flight, b);
-
-               ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
-               if (ret)
-                       goto err;
-
-               *did_work = true;
-       }
-err:
-       /* no entries in LRU btree found, or got to end: */
-       if (bch2_err_matches(ret, ENOENT))
-               ret = 0;
-
-       if (ret < 0 && !bch2_err_matches(ret, EROFS))
-               bch_err_msg(c, ret, "from bch2_move_data()");
-
-       sectors_seen    = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
-       sectors_moved   = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
-       trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
-
-       darray_for_each(buckets_in_flight->to_evacuate, i)
-               if (*i)
-                       move_bucket_free(buckets_in_flight, *i);
-       darray_exit(&buckets_in_flight->to_evacuate);
-       return ret;
-}
-
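-/*
- * Returns how much more data may be fragmented before copygc should run on
- * this device: half the space available at the stripe watermark, less what's
- * already fragmented:
- */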
-static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
-{
-       struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
-       struct bch_dev_usage usage;
-
-       for (unsigned i = 0; i < BCH_DATA_NR; i++)
-               usage.buckets[i] = usage_full.d[i].buckets;
-
-       s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
-                                  ca->mi.bucket_size) >> 1);
-       s64 fragmented = 0;
-
-       for (unsigned i = 0; i < BCH_DATA_NR; i++)
-               if (data_type_movable(i))
-                       fragmented += usage_full.d[i].fragmented;
-
-       return max(0LL, fragmented_allowed - fragmented);
-}
-
-/*
- * Copygc runs when the amount of fragmented data is above some arbitrary
- * threshold:
- *
- * The threshold at the limit - when the device is full - is the amount of space
- * we reserved in bch2_recalc_capacity; we can't have more than that amount of
- * disk space stranded due to fragmentation and still store everything we have
- * promised to store.
- *
- * But we don't want to be running copygc unnecessarily when the device still
- * has plenty of free space - rather, we want copygc to smoothly run every so
- * often and continually reduce the amount of fragmented space as the device
- * fills up. So, we increase the threshold by half the current free space.
- */
-u64 bch2_copygc_wait_amount(struct bch_fs *c)
-{
-       u64 wait = U64_MAX;
-
-       guard(rcu)();
-       for_each_rw_member_rcu(c, ca)
-               wait = min(wait, bch2_copygc_dev_wait_amount(ca));
-       return wait;
-}
-
-void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       printbuf_tabstop_push(out, 32);
-       prt_printf(out, "running:\t%u\n",               c->copygc_running);
-       prt_printf(out, "copygc_wait:\t%llu\n",         c->copygc_wait);
-       prt_printf(out, "copygc_wait_at:\t%llu\n",      c->copygc_wait_at);
-
-       prt_printf(out, "Currently waiting for:\t");
-       prt_human_readable_u64(out, max(0LL, c->copygc_wait -
-                                       atomic64_read(&c->io_clock[WRITE].now)) << 9);
-       prt_newline(out);
-
-       prt_printf(out, "Currently waiting since:\t");
-       prt_human_readable_u64(out, max(0LL,
-                                       atomic64_read(&c->io_clock[WRITE].now) -
-                                       c->copygc_wait_at) << 9);
-       prt_newline(out);
-
-       bch2_printbuf_make_room(out, 4096);
-
-       struct task_struct *t;
-       out->atomic++;
-       scoped_guard(rcu) {
-               prt_printf(out, "Currently calculated wait:\n");
-               for_each_rw_member_rcu(c, ca) {
-                       prt_printf(out, "  %s:\t", ca->name);
-                       prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
-                       prt_newline(out);
-               }
-
-               t = rcu_dereference(c->copygc_thread);
-               if (t)
-                       get_task_struct(t);
-       }
-       --out->atomic;
-
-       if (t) {
-               bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
-               put_task_struct(t);
-       }
-}
-
-static int bch2_copygc_thread(void *arg)
-{
-       struct bch_fs *c = arg;
-       struct moving_context ctxt;
-       struct bch_move_stats move_stats;
-       struct io_clock *clock = &c->io_clock[WRITE];
-       struct buckets_in_flight buckets = {};
-       u64 last, wait;
-
-       buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL);
-       int ret = !buckets.table
-               ? -ENOMEM
-               : rhashtable_init(buckets.table, &bch_move_bucket_params);
-       bch_err_msg(c, ret, "allocating copygc buckets in flight");
-       if (ret)
-               goto err;
-
-       set_freezable();
-
-       /*
-        * Data move operations can't run until after check_snapshots has
-        * completed, and bch2_snapshot_is_ancestor() is available.
-        */
-       kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
-                              kthread_should_stop());
-
-       bch2_move_stats_init(&move_stats, "copygc");
-       bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
-                             writepoint_ptr(&c->copygc_write_point),
-                             false);
-
-       while (!ret && !kthread_should_stop()) {
-               bool did_work = false;
-
-               bch2_trans_unlock_long(ctxt.trans);
-               cond_resched();
-
-               if (!c->opts.copygc_enabled) {
-                       move_buckets_wait(&ctxt, &buckets, true);
-                       kthread_wait_freezable(c->opts.copygc_enabled ||
-                                              kthread_should_stop());
-               }
-
-               if (unlikely(freezing(current))) {
-                       move_buckets_wait(&ctxt, &buckets, true);
-                       __refrigerator(false);
-                       continue;
-               }
-
-               last = atomic64_read(&clock->now);
-               wait = bch2_copygc_wait_amount(c);
-
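-               /*
-                * Below the fragmentation threshold: record what we're
-                * waiting for and sleep on the write I/O clock:
-                */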
-               if (wait > clock->max_slop) {
-                       c->copygc_wait_at = last;
-                       c->copygc_wait = last + wait;
-                       move_buckets_wait(&ctxt, &buckets, true);
-                       trace_and_count(c, copygc_wait, c, wait, last + wait);
-                       bch2_kthread_io_clock_wait(clock, last + wait,
-                                       MAX_SCHEDULE_TIMEOUT);
-                       continue;
-               }
-
-               c->copygc_wait = 0;
-
-               c->copygc_running = true;
-               ret = bch2_copygc(&ctxt, &buckets, &did_work);
-               c->copygc_running = false;
-
-               wake_up(&c->copygc_running_wq);
-
-               if (!wait && !did_work) {
-                       u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
-                       if (min_member_capacity == U64_MAX)
-                               min_member_capacity = 128 * 2048;
-
-                       move_buckets_wait(&ctxt, &buckets, true);
-                       bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
-                                       MAX_SCHEDULE_TIMEOUT);
-               }
-       }
-
-       move_buckets_wait(&ctxt, &buckets, true);
-       rhashtable_destroy(buckets.table);
-       bch2_moving_ctxt_exit(&ctxt);
-       bch2_move_stats_exit(&move_stats, c);
-err:
-       kfree(buckets.table);
-       return ret;
-}
-
-void bch2_copygc_stop(struct bch_fs *c)
-{
-       if (c->copygc_thread) {
-               kthread_stop(c->copygc_thread);
-               put_task_struct(c->copygc_thread);
-       }
-       c->copygc_thread = NULL;
-}
-
-int bch2_copygc_start(struct bch_fs *c)
-{
-       struct task_struct *t;
-       int ret;
-
-       if (c->copygc_thread)
-               return 0;
-
-       if (c->opts.nochanges)
-               return 0;
-
-       if (bch2_fs_init_fault("copygc_start"))
-               return -ENOMEM;
-
-       t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
-       ret = PTR_ERR_OR_ZERO(t);
-       bch_err_msg(c, ret, "creating copygc thread");
-       if (ret)
-               return ret;
-
-       get_task_struct(t);
-
-       c->copygc_thread = t;
-       wake_up_process(c->copygc_thread);
-
-       return 0;
-}
-
-void bch2_fs_copygc_init(struct bch_fs *c)
-{
-       init_waitqueue_head(&c->copygc_running_wq);
-       c->copygc_running = false;
-}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
deleted file mode 100644 (file)
index f615910..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVINGGC_H
-#define _BCACHEFS_MOVINGGC_H
-
-u64 bch2_copygc_wait_amount(struct bch_fs *);
-void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
-
-static inline void bch2_copygc_wakeup(struct bch_fs *c)
-{
-       guard(rcu)();
-       struct task_struct *p = rcu_dereference(c->copygc_thread);
-       if (p)
-               wake_up_process(p);
-}
-
-void bch2_copygc_stop(struct bch_fs *);
-int bch2_copygc_start(struct bch_fs *);
-void bch2_fs_copygc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
deleted file mode 100644 (file)
index c3f87c5..0000000
+++ /dev/null
@@ -1,1034 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "inode.h"
-#include "namei.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
-{
-       return (subvol_inum) {
-               .subvol = inode->bi_parent_subvol ?: inum.subvol,
-               .inum   = inode->bi_dir,
-       };
-}
-
-static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
-{
-       return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
-}
-
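-/*
- * Create an inode and (unless it's a tmpfile) its dirent in one transaction;
- * also handles the snapshot and subvolume creation paths:
- */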
-int bch2_create_trans(struct btree_trans *trans,
-                     subvol_inum dir,
-                     struct bch_inode_unpacked *dir_u,
-                     struct bch_inode_unpacked *new_inode,
-                     const struct qstr *name,
-                     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
-                     struct posix_acl *default_acl,
-                     struct posix_acl *acl,
-                     subvol_inum snapshot_src,
-                     unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter dir_iter = {};
-       struct btree_iter inode_iter = {};
-       subvol_inum new_inum = dir;
-       u64 now = bch2_current_time(c);
-       u64 cpu = raw_smp_processor_id();
-       u64 dir_target;
-       u32 snapshot;
-       unsigned dir_type = mode_to_type(mode);
-       int ret;
-
-       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
-                             BTREE_ITER_intent|BTREE_ITER_with_updates);
-       if (ret)
-               goto err;
-
-       if (!(flags & BCH_CREATE_SNAPSHOT)) {
-               /* Normal create path - allocate a new inode: */
-               bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
-
-               if (flags & BCH_CREATE_TMPFILE)
-                       new_inode->bi_flags |= BCH_INODE_unlinked;
-
-               ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
-               if (ret)
-                       goto err;
-
-               snapshot_src = (subvol_inum) { 0 };
-       } else {
-               /*
-                * Creating a snapshot - we're not allocating a new inode, but
-                * we do have to lookup the root inode of the subvolume we're
-                * snapshotting and update it (in the new snapshot):
-                */
-
-               if (!snapshot_src.inum) {
-                       /* Inode wasn't specified, just snapshot: */
-                       struct bch_subvolume s;
-                       ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
-                       if (ret)
-                               goto err;
-
-                       snapshot_src.inum = le64_to_cpu(s.inode);
-               }
-
-               ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
-                                     BTREE_ITER_intent);
-               if (ret)
-                       goto err;
-
-               if (new_inode->bi_subvol != snapshot_src.subvol) {
-                       /* Not a subvolume root: */
-                       ret = -EINVAL;
-                       goto err;
-               }
-
-               /*
-                * If we're not root, we have to own the subvolume being
-                * snapshotted:
-                */
-               if (uid && new_inode->bi_uid != uid) {
-                       ret = -EPERM;
-                       goto err;
-               }
-
-               flags |= BCH_CREATE_SUBVOL;
-       }
-
-       new_inum.inum   = new_inode->bi_inum;
-       dir_target      = new_inode->bi_inum;
-
-       if (flags & BCH_CREATE_SUBVOL) {
-               u32 new_subvol, dir_snapshot;
-
-               ret = bch2_subvolume_create(trans, new_inode->bi_inum,
-                                           dir.subvol,
-                                           snapshot_src.subvol,
-                                           &new_subvol, &snapshot,
-                                           (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
-               if (ret)
-                       goto err;
-
-               new_inode->bi_parent_subvol     = dir.subvol;
-               new_inode->bi_subvol            = new_subvol;
-               new_inum.subvol                 = new_subvol;
-               dir_target                      = new_subvol;
-               dir_type                        = DT_SUBVOL;
-
-               ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
-               if (ret)
-                       goto err;
-
-               bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
-               ret = bch2_btree_iter_traverse(trans, &dir_iter);
-               if (ret)
-                       goto err;
-       }
-
-       if (!(flags & BCH_CREATE_SNAPSHOT)) {
-               if (default_acl) {
-                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-                                                default_acl, ACL_TYPE_DEFAULT);
-                       if (ret)
-                               goto err;
-               }
-
-               if (acl) {
-                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-                                                acl, ACL_TYPE_ACCESS);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       if (!(flags & BCH_CREATE_TMPFILE)) {
-               struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
-               u64 dir_offset;
-
-               if (is_subdir_for_nlink(new_inode))
-                       dir_u->bi_nlink++;
-               dir_u->bi_mtime = dir_u->bi_ctime = now;
-
-               ret =   bch2_dirent_create(trans, dir, &dir_hash,
-                                          dir_type,
-                                          name,
-                                          dir_target,
-                                          &dir_offset,
-                                          STR_HASH_must_create|BTREE_ITER_with_updates) ?:
-                       bch2_inode_write(trans, &dir_iter, dir_u);
-               if (ret)
-                       goto err;
-
-               new_inode->bi_dir               = dir_u->bi_inum;
-               new_inode->bi_dir_offset        = dir_offset;
-       }
-
-       if (S_ISDIR(mode)) {
-               ret = bch2_maybe_propagate_has_case_insensitive(trans,
-                               (subvol_inum) {
-                                       new_inode->bi_subvol ?: dir.subvol,
-                                       new_inode->bi_inum },
-                               new_inode);
-               if (ret)
-                       goto err;
-       }
-
-       if (S_ISDIR(mode) &&
-           !new_inode->bi_subvol)
-               new_inode->bi_depth = dir_u->bi_depth + 1;
-
-       inode_iter.flags &= ~BTREE_ITER_all_snapshots;
-       bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
-
-       ret   = bch2_btree_iter_traverse(trans, &inode_iter) ?:
-               bch2_inode_write(trans, &inode_iter, new_inode);
-err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-       bch2_trans_iter_exit(trans, &dir_iter);
-       return ret;
-}
-
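A note on the flag handling above: BCH_CREATE_SNAPSHOT reuses the subvolume root inode and forces the subvolume path, while BCH_CREATE_TMPFILE allocates an inode but skips the dirent. A minimal standalone sketch of that dispatch (plain C, illustrative only; the flag values mirror the namei.h definitions further down):

/* Standalone sketch, not kernel code: which path bch2_create_trans()
 * takes for each flag combination. Flag values mirror namei.h. */
#include <stdio.h>

#define BCH_CREATE_TMPFILE      (1U << 0)
#define BCH_CREATE_SUBVOL       (1U << 1)
#define BCH_CREATE_SNAPSHOT     (1U << 2)

static const char *create_path(unsigned flags)
{
        if (flags & BCH_CREATE_SNAPSHOT)
                return "snapshot: reuse subvolume root inode, BCH_CREATE_SUBVOL implied";
        if (flags & BCH_CREATE_TMPFILE)
                return "tmpfile: new inode marked BCH_INODE_unlinked, no dirent";
        return "normal create: new inode plus a dirent in the parent";
}

int main(void)
{
        unsigned cases[] = { 0, BCH_CREATE_TMPFILE, BCH_CREATE_SNAPSHOT };

        for (unsigned i = 0; i < 3; i++)
                printf("flags=%u: %s\n", cases[i], create_path(cases[i]));
        return 0;
}
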
-int bch2_link_trans(struct btree_trans *trans,
-                   subvol_inum dir,  struct bch_inode_unpacked *dir_u,
-                   subvol_inum inum, struct bch_inode_unpacked *inode_u,
-                   const struct qstr *name)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter dir_iter = {};
-       struct btree_iter inode_iter = {};
-       struct bch_hash_info dir_hash;
-       u64 now = bch2_current_time(c);
-       u64 dir_offset = 0;
-       int ret;
-
-       if (dir.subvol != inum.subvol)
-               return -EXDEV;
-
-       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
-       if (ret)
-               return ret;
-
-       inode_u->bi_ctime = now;
-       ret = bch2_inode_nlink_inc(inode_u);
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       if (bch2_reinherit_attrs(inode_u, dir_u)) {
-               ret = -EXDEV;
-               goto err;
-       }
-
-       dir_u->bi_mtime = dir_u->bi_ctime = now;
-
-       dir_hash = bch2_hash_info_init(c, dir_u);
-
-       ret = bch2_dirent_create(trans, dir, &dir_hash,
-                                mode_to_type(inode_u->bi_mode),
-                                name, inum.inum,
-                                &dir_offset,
-                                STR_HASH_must_create);
-       if (ret)
-               goto err;
-
-       inode_u->bi_dir         = dir.inum;
-       inode_u->bi_dir_offset  = dir_offset;
-
-       ret =   bch2_inode_write(trans, &dir_iter, dir_u) ?:
-               bch2_inode_write(trans, &inode_iter, inode_u);
-err:
-       bch2_trans_iter_exit(trans, &dir_iter);
-       bch2_trans_iter_exit(trans, &inode_iter);
-       return ret;
-}
-
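The link path above is mostly bookkeeping: bump nlink, insert a dirent pointing at the existing inum, and repoint the inode's single (bi_dir, bi_dir_offset) backpointer at the newest name. A minimal standalone sketch, with illustrative types rather than the kernel's:

/* Standalone sketch of the hard-link bookkeeping; the backpointer
 * tracks whichever dirent was created most recently. */
#include <stdio.h>

struct inode { unsigned nlink; unsigned long long dir, dir_offset; };

static void add_link(struct inode *ino, unsigned long long dir,
                     unsigned long long dirent_offset)
{
        ino->nlink++;
        ino->dir        = dir;
        ino->dir_offset = dirent_offset;
}

int main(void)
{
        struct inode ino = { .nlink = 1, .dir = 100, .dir_offset = 5 };

        add_link(&ino, 200, 9);
        printf("nlink=%u backpointer=%llu:%llu\n",
               ino.nlink, ino.dir, ino.dir_offset);     /* 2 200:9 */
        return 0;
}
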
-int bch2_unlink_trans(struct btree_trans *trans,
-                     subvol_inum dir,
-                     struct bch_inode_unpacked *dir_u,
-                     struct bch_inode_unpacked *inode_u,
-                     const struct qstr *name,
-                     bool deleting_subvol)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter dir_iter = {};
-       struct btree_iter dirent_iter = {};
-       struct btree_iter inode_iter = {};
-       struct bch_hash_info dir_hash;
-       subvol_inum inum;
-       u64 now = bch2_current_time(c);
-       struct bkey_s_c k;
-       int ret;
-
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       dir_hash = bch2_hash_info_init(c, dir_u);
-
-       ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
-                                      name, &inum, BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
-                             BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
-               ret = bch2_empty_dir_trans(trans, inum);
-               if (ret)
-                       goto err;
-       }
-
-       if (deleting_subvol && !inode_u->bi_subvol) {
-               ret = bch_err_throw(c, ENOENT_not_subvol);
-               goto err;
-       }
-
-       if (inode_u->bi_subvol) {
-               /* Recursive subvolume destroy not allowed (yet?) */
-               ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
-               if (ret)
-                       goto err;
-       }
-
-       if (deleting_subvol || inode_u->bi_subvol) {
-               ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
-               if (ret)
-                       goto err;
-
-               k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               /*
-                * If we're deleting a subvolume, we need to really delete the
-                * dirent, not just emit a whiteout in the current snapshot:
-                */
-               bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
-               ret = bch2_btree_iter_traverse(trans, &dirent_iter);
-               if (ret)
-                       goto err;
-       } else {
-               bch2_inode_nlink_dec(trans, inode_u);
-       }
-
-       if (inode_u->bi_dir             == dirent_iter.pos.inode &&
-           inode_u->bi_dir_offset      == dirent_iter.pos.offset) {
-               inode_u->bi_dir         = 0;
-               inode_u->bi_dir_offset  = 0;
-       }
-
-       dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
-       dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-
-       ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                   &dir_hash, &dirent_iter,
-                                   BTREE_UPDATE_internal_snapshot_node) ?:
-               bch2_inode_write(trans, &dir_iter, dir_u) ?:
-               bch2_inode_write(trans, &inode_iter, inode_u);
-err:
-       bch2_trans_iter_exit(trans, &inode_iter);
-       bch2_trans_iter_exit(trans, &dirent_iter);
-       bch2_trans_iter_exit(trans, &dir_iter);
-       return ret;
-}
-
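The snapshot handling above is the subtle part: an ordinary unlink in a child snapshot emits a whiteout that masks the ancestor's dirent, while deleting a subvolume must remove the dirent in the snapshot that owns it. A standalone sketch of the whiteout-masking idea, assuming a simplified linear snapshot ancestry (parent = snap - 1) rather than bcachefs' snapshot tree:

/* Standalone sketch of whiteout masking; linear ancestry is a
 * deliberate simplification. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct dirent_key {
        const char      *name;
        int             snapshot;
        bool            whiteout;       /* masks the name in this snapshot */
        bool            live;
};

static struct dirent_key tbl[8];

static void set_key(const char *name, int snap, bool whiteout)
{
        for (int i = 0; i < 8; i++)
                if (!tbl[i].live) {
                        tbl[i] = (struct dirent_key) { name, snap, whiteout, true };
                        return;
                }
}

/* lookup in @snap, falling back to ancestor snapshots */
static bool name_exists(const char *name, int snap)
{
        for (; snap > 0; snap--)
                for (int i = 0; i < 8; i++)
                        if (tbl[i].live && tbl[i].snapshot == snap &&
                            !strcmp(tbl[i].name, name))
                                return !tbl[i].whiteout;
        return false;
}

int main(void)
{
        set_key("file", 1, false);      /* created in snapshot 1 */
        set_key("file", 2, true);       /* ordinary unlink in child: whiteout */
        printf("%d %d\n", name_exists("file", 1), name_exists("file", 2)); /* 1 0 */
        return 0;
}
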
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
-                         struct bch_inode_unpacked *src_u)
-{
-       u64 src, dst;
-       unsigned id;
-       bool ret = false;
-
-       for (id = 0; id < Inode_opt_nr; id++) {
-               if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
-                       continue;
-
-               /* Skip attributes that were explicitly set on this inode */
-               if (dst_u->bi_fields_set & (1 << id))
-                       continue;
-
-               src = bch2_inode_opt_get(src_u, id);
-               dst = bch2_inode_opt_get(dst_u, id);
-
-               if (src == dst)
-                       continue;
-
-               bch2_inode_opt_set(dst_u, id, src);
-               ret = true;
-       }
-
-       return ret;
-}
-
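bch2_reinherit_attrs() copies each inheritable option from the directory unless the inode set it explicitly, which is tracked by a per-option bit in bi_fields_set. A standalone sketch of that mask logic; the option ids here are made up, not the real Inode_opt_* enum:

/* Standalone sketch: an option is re-inherited only if its bit is
 * clear in fields_set. */
#include <stdbool.h>
#include <stdio.h>

enum { Opt_compression, Opt_background_target, Opt_nr };

struct inode_opts {
        unsigned fields_set;            /* bit n: option n set explicitly */
        unsigned vals[Opt_nr];
};

static bool reinherit(struct inode_opts *dst, const struct inode_opts *src)
{
        bool changed = false;

        for (unsigned id = 0; id < Opt_nr; id++) {
                if (dst->fields_set & (1U << id))
                        continue;       /* explicitly set: never overridden */
                if (dst->vals[id] != src->vals[id]) {
                        dst->vals[id] = src->vals[id];
                        changed = true;
                }
        }
        return changed;
}

int main(void)
{
        struct inode_opts dir  = { .vals = { 2, 7 } };
        struct inode_opts file = { .fields_set = 1U << Opt_compression,
                                   .vals = { 9, 0 } };

        printf("changed=%d compression=%u target=%u\n",
               reinherit(&file, &dir), file.vals[Opt_compression],
               file.vals[Opt_background_target]);       /* changed=1 9 7 */
        return 0;
}
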
-static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
-{
-       struct btree_iter iter;
-       struct bkey_i_subvolume *s =
-               bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_subvolumes, POS(0, subvol),
-                       BTREE_ITER_cached, subvolume);
-       int ret = PTR_ERR_OR_ZERO(s);
-       if (ret)
-               return ret;
-
-       s->v.fs_path_parent = cpu_to_le32(new_parent);
-       bch2_trans_iter_exit(trans, &iter);
-       return 0;
-}
-
-int bch2_rename_trans(struct btree_trans *trans,
-                     subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
-                     subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
-                     struct bch_inode_unpacked *src_inode_u,
-                     struct bch_inode_unpacked *dst_inode_u,
-                     const struct qstr *src_name,
-                     const struct qstr *dst_name,
-                     enum bch_rename_mode mode)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter src_dir_iter = {};
-       struct btree_iter dst_dir_iter = {};
-       struct btree_iter src_inode_iter = {};
-       struct btree_iter dst_inode_iter = {};
-       struct bch_hash_info src_hash, dst_hash;
-       subvol_inum src_inum, dst_inum;
-       u64 src_offset, dst_offset;
-       u64 now = bch2_current_time(c);
-       int ret;
-
-       ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
-                             BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       src_hash = bch2_hash_info_init(c, src_dir_u);
-
-       if (!subvol_inum_eq(dst_dir, src_dir)) {
-               ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
-                                     BTREE_ITER_intent);
-               if (ret)
-                       goto err;
-
-               dst_hash = bch2_hash_info_init(c, dst_dir_u);
-       } else {
-               dst_dir_u = src_dir_u;
-               dst_hash = src_hash;
-       }
-
-       ret = bch2_dirent_rename(trans,
-                                src_dir, &src_hash,
-                                dst_dir, &dst_hash,
-                                src_name, &src_inum, &src_offset,
-                                dst_name, &dst_inum, &dst_offset,
-                                mode);
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
-                             BTREE_ITER_intent);
-       if (ret)
-               goto err;
-
-       if (dst_inum.inum) {
-               ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
-                                     BTREE_ITER_intent);
-               if (ret)
-                       goto err;
-       }
-
-       if (src_inode_u->bi_subvol &&
-           dst_dir.subvol != src_inode_u->bi_parent_subvol) {
-               ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
-               if (ret)
-                       goto err;
-       }
-
-       if (mode == BCH_RENAME_EXCHANGE &&
-           dst_inode_u->bi_subvol &&
-           src_dir.subvol != dst_inode_u->bi_parent_subvol) {
-               ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
-               if (ret)
-                       goto err;
-       }
-
-       /* Can't move across subvolumes, unless it's a subvolume root: */
-       if (src_dir.subvol != dst_dir.subvol &&
-           (!src_inode_u->bi_subvol ||
-            (dst_inum.inum && !dst_inode_u->bi_subvol))) {
-               ret = -EXDEV;
-               goto err;
-       }
-
-       if (src_inode_u->bi_parent_subvol)
-               src_inode_u->bi_parent_subvol = dst_dir.subvol;
-
-       if ((mode == BCH_RENAME_EXCHANGE) &&
-           dst_inode_u->bi_parent_subvol)
-               dst_inode_u->bi_parent_subvol = src_dir.subvol;
-
-       src_inode_u->bi_dir             = dst_dir_u->bi_inum;
-       src_inode_u->bi_dir_offset      = dst_offset;
-
-       if (mode == BCH_RENAME_EXCHANGE) {
-               dst_inode_u->bi_dir             = src_dir_u->bi_inum;
-               dst_inode_u->bi_dir_offset      = src_offset;
-       }
-
-       if (mode == BCH_RENAME_OVERWRITE &&
-           dst_inode_u->bi_dir         == dst_dir_u->bi_inum &&
-           dst_inode_u->bi_dir_offset  == src_offset) {
-               dst_inode_u->bi_dir             = 0;
-               dst_inode_u->bi_dir_offset      = 0;
-       }
-
-       if (mode == BCH_RENAME_OVERWRITE) {
-               if (S_ISDIR(src_inode_u->bi_mode) !=
-                   S_ISDIR(dst_inode_u->bi_mode)) {
-                       ret = -ENOTDIR;
-                       goto err;
-               }
-
-               if (S_ISDIR(dst_inode_u->bi_mode)) {
-                       ret = bch2_empty_dir_trans(trans, dst_inum);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       if (!subvol_inum_eq(dst_dir, src_dir)) {
-               if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
-                   S_ISDIR(src_inode_u->bi_mode)) {
-                       ret = -EXDEV;
-                       goto err;
-               }
-
-               if (mode == BCH_RENAME_EXCHANGE &&
-                   bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
-                   S_ISDIR(dst_inode_u->bi_mode)) {
-                       ret = -EXDEV;
-                       goto err;
-               }
-
-               ret =   bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
-                       (mode == BCH_RENAME_EXCHANGE
-                        ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
-                        : 0);
-               if (ret)
-                       goto err;
-
-               if (is_subdir_for_nlink(src_inode_u)) {
-                       src_dir_u->bi_nlink--;
-                       dst_dir_u->bi_nlink++;
-               }
-
-               if (S_ISDIR(src_inode_u->bi_mode) &&
-                   !src_inode_u->bi_subvol)
-                       src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
-
-               if (mode == BCH_RENAME_EXCHANGE &&
-                   S_ISDIR(dst_inode_u->bi_mode) &&
-                   !dst_inode_u->bi_subvol)
-                       dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
-       }
-
-       if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
-               dst_dir_u->bi_nlink--;
-               src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
-       }
-
-       if (mode == BCH_RENAME_OVERWRITE)
-               bch2_inode_nlink_dec(trans, dst_inode_u);
-
-       src_dir_u->bi_mtime             = now;
-       src_dir_u->bi_ctime             = now;
-
-       if (src_dir.inum != dst_dir.inum) {
-               dst_dir_u->bi_mtime     = now;
-               dst_dir_u->bi_ctime     = now;
-       }
-
-       src_inode_u->bi_ctime           = now;
-
-       if (dst_inum.inum)
-               dst_inode_u->bi_ctime   = now;
-
-       ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
-               (src_dir.inum != dst_dir.inum
-                ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
-                : 0) ?:
-               bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
-               (dst_inum.inum
-                ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
-                : 0);
-err:
-       bch2_trans_iter_exit(trans, &dst_inode_iter);
-       bch2_trans_iter_exit(trans, &src_inode_iter);
-       bch2_trans_iter_exit(trans, &dst_dir_iter);
-       bch2_trans_iter_exit(trans, &src_dir_iter);
-       return ret;
-}
-
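After bch2_dirent_rename() the affected inodes' backpointers must track their new dirents, which is what the bi_dir/bi_dir_offset assignments above do for each mode. A standalone sketch of just that bookkeeping, with illustrative types and mode names:

/* Standalone sketch of the rename backpointer updates; only the
 * bookkeeping pattern matters here. */
#include <stdio.h>

enum rename_mode { RENAME_PLAIN, RENAME_OVERWRITE, RENAME_EXCHANGE };

struct ino { unsigned long long dir, dir_offset; };

static void fixup_backpointers(enum rename_mode mode,
                               struct ino *src, struct ino *dst,
                               unsigned long long src_dir, unsigned long long src_off,
                               unsigned long long dst_dir, unsigned long long dst_off)
{
        /* src inode now lives at the destination dirent: */
        src->dir        = dst_dir;
        src->dir_offset = dst_off;

        if (mode == RENAME_EXCHANGE) {
                /* dst inode moved to where src used to be: */
                dst->dir        = src_dir;
                dst->dir_offset = src_off;
        }
        /* RENAME_OVERWRITE: the overwritten inode's dirent is gone; its
         * backpointer is cleared before it's unlinked. */
}

int main(void)
{
        struct ino a = {0}, b = {0};

        fixup_backpointers(RENAME_EXCHANGE, &a, &b, 100, 1, 200, 2);
        printf("a -> %llu:%llu, b -> %llu:%llu\n",
               a.dir, a.dir_offset, b.dir, b.dir_offset); /* a -> 200:2, b -> 100:1 */
        return 0;
}
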
-/* inum_to_path */
-
-static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
-{
-       bch2_printbuf_make_room(out, n);
-
-       unsigned can_print = min(n, printbuf_remaining(out));
-
-       b += n;
-
-       for (unsigned i = 0; i < can_print; i++)
-               out->buf[out->pos++] = *((char *) --b);
-
-       printbuf_nul_terminate(out);
-}
-
-static inline void prt_str_reversed(struct printbuf *out, const char *s)
-{
-       prt_bytes_reversed(out, s, strlen(s));
-}
-
-static inline void reverse_bytes(void *b, size_t n)
-{
-       char *e = b + n, *s = b;
-
-       while (s < e) {
-               --e;
-               swap(*s, *e);
-               s++;
-       }
-}
-
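Building the path leaf-to-root means each component is appended byte-reversed, then the whole buffer is reversed once at the end, which is exactly what the helpers above implement. A standalone demonstration of the trick:

/* Standalone sketch: append each component reversed while walking
 * child -> parent, then one final reversal yields root -> child order. */
#include <stdio.h>
#include <string.h>

static void append_reversed(char *buf, size_t *pos, const char *s)
{
        for (size_t n = strlen(s); n--;)
                buf[(*pos)++] = s[n];
}

static void reverse_bytes(char *b, size_t n)
{
        for (char *e = b + n; b < --e; b++) {
                char tmp = *b;
                *b = *e;
                *e = tmp;
        }
}

int main(void)
{
        char buf[64];
        size_t pos = 0;

        /* walk from the leaf up: "file", then "dir" */
        append_reversed(buf, &pos, "file");
        append_reversed(buf, &pos, "/");
        append_reversed(buf, &pos, "dir");
        append_reversed(buf, &pos, "/");

        reverse_bytes(buf, pos);
        buf[pos] = '\0';
        printf("%s\n", buf);    /* /dir/file */
        return 0;
}
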
-static int __bch2_inum_to_path(struct btree_trans *trans,
-                              u32 subvol, u64 inum, u32 snapshot,
-                              struct printbuf *path)
-{
-       unsigned orig_pos = path->pos;
-       int ret = 0;
-       DARRAY(subvol_inum) inums = {};
-
-       if (!snapshot) {
-               ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
-               if (ret)
-                       goto disconnected;
-       }
-
-       while (true) {
-               subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum };
-
-               if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) {
-                       prt_str_reversed(path, "(loop)");
-                       break;
-               }
-
-               ret = darray_push(&inums, n);
-               if (ret)
-                       goto err;
-
-               struct bch_inode_unpacked inode;
-               ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
-               if (ret)
-                       goto disconnected;
-
-               if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
-                   inode.bi_inum == BCACHEFS_ROOT_INO)
-                       break;
-
-               if (!inode.bi_dir && !inode.bi_dir_offset) {
-                       ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
-                       goto disconnected;
-               }
-
-               inum = inode.bi_dir;
-               if (inode.bi_parent_subvol) {
-                       subvol = inode.bi_parent_subvol;
-                       ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot);
-                       if (ret)
-                               goto disconnected;
-               }
-
-               struct btree_iter d_iter;
-               struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
-                               BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
-                               0, dirent);
-               ret = bkey_err(d.s_c);
-               if (ret)
-                       goto disconnected;
-
-               struct qstr dirent_name = bch2_dirent_get_name(d);
-
-               prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
-
-               prt_char(path, '/');
-
-               bch2_trans_iter_exit(trans, &d_iter);
-       }
-
-       if (orig_pos == path->pos)
-               prt_char(path, '/');
-out:
-       ret = path->allocation_failure ? -ENOMEM : 0;
-       if (ret)
-               goto err;
-
-       reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
-       darray_exit(&inums);
-       return 0;
-err:
-       darray_exit(&inums);
-       return ret;
-disconnected:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto err;
-
-       prt_str_reversed(path, "(disconnected)");
-       goto out;
-}
-
-int bch2_inum_to_path(struct btree_trans *trans,
-                     subvol_inum inum,
-                     struct printbuf *path)
-{
-       return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
-}
-
-int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
-                              snapshot_id_list *snapshot_overwrites,
-                              struct printbuf *path)
-{
-       return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
-}
-
-/* fsck */
-
-static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
-                                         struct bkey_s_c_dirent d,
-                                         struct bch_inode_unpacked *target,
-                                         bool in_fsck)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       struct btree_iter bp_iter = {};
-       int ret = 0;
-
-       if (inode_points_to_dirent(target, d))
-               return 0;
-
-       if (!bch2_inode_has_backpointer(target)) {
-               fsck_err_on(S_ISDIR(target->bi_mode),
-                           trans, inode_dir_missing_backpointer,
-                           "directory with missing backpointer\n%s",
-                           (printbuf_reset(&buf),
-                            bch2_bkey_val_to_text(&buf, c, d.s_c),
-                            prt_printf(&buf, "\n"),
-                            bch2_inode_unpacked_to_text(&buf, target),
-                            buf.buf));
-
-               fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
-                           trans, inode_unlinked_but_has_dirent,
-                           "inode unlinked but has dirent\n%s",
-                           (printbuf_reset(&buf),
-                            bch2_bkey_val_to_text(&buf, c, d.s_c),
-                            prt_printf(&buf, "\n"),
-                            bch2_inode_unpacked_to_text(&buf, target),
-                            buf.buf));
-
-               target->bi_flags &= ~BCH_INODE_unlinked;
-               target->bi_dir          = d.k->p.inode;
-               target->bi_dir_offset   = d.k->p.offset;
-               return __bch2_fsck_write_inode(trans, target);
-       }
-
-       struct bkey_s_c_dirent bp_dirent =
-               bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
-                             SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
-                             0, dirent);
-       ret = bkey_err(bp_dirent);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       bool backpointer_exists = !ret;
-       ret = 0;
-
-       if (!backpointer_exists) {
-               if (fsck_err(trans, inode_wrong_backpointer,
-                            "inode %llu:%u has wrong backpointer:\n"
-                            "got       %llu:%llu\n"
-                            "should be %llu:%llu",
-                            target->bi_inum, target->bi_snapshot,
-                            target->bi_dir,
-                            target->bi_dir_offset,
-                            d.k->p.inode,
-                            d.k->p.offset)) {
-                       target->bi_dir          = d.k->p.inode;
-                       target->bi_dir_offset   = d.k->p.offset;
-                       ret = __bch2_fsck_write_inode(trans, target);
-               }
-       } else {
-               printbuf_reset(&buf);
-               bch2_bkey_val_to_text(&buf, c, d.s_c);
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
-               if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
-                       /*
-                        * XXX: verify connectivity of the other dirent
-                        * up to the root before removing this one
-                        *
-                        * Additionally, bch2_lookup would need to cope with the
-                        * dirent it found being removed - or should we remove
-                        * the other one, even though the inode points to it?
-                        */
-                       if (in_fsck) {
-                               if (fsck_err(trans, inode_dir_multiple_links,
-                                            "%s %llu:%u with multiple links\n%s",
-                                            S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
-                                            target->bi_inum, target->bi_snapshot, buf.buf))
-                                       ret = bch2_fsck_remove_dirent(trans, d.k->p);
-                       } else {
-                               bch2_fs_inconsistent(c,
-                                               "%s %llu:%u with multiple links\n%s",
-                                               S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
-                                               target->bi_inum, target->bi_snapshot, buf.buf);
-                       }
-
-                       goto out;
-               } else {
-                       /*
-                        * hardlinked file with nlink 0:
-                        * We're just adjusting nlink here so check_nlinks() will pick
-                        * it up; it ignores inodes with nlink 0
-                        */
-                       if (fsck_err_on(!target->bi_nlink,
-                                       trans, inode_multiple_links_but_nlink_0,
-                                       "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
-                                       target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
-                               target->bi_nlink++;
-                               target->bi_flags &= ~BCH_INODE_unlinked;
-                               ret = __bch2_fsck_write_inode(trans, target);
-                               if (ret)
-                                       goto err;
-                       }
-               }
-       }
-out:
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &bp_iter);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int __bch2_check_dirent_target(struct btree_trans *trans,
-                              struct btree_iter *dirent_iter,
-                              struct bkey_s_c_dirent d,
-                              struct bch_inode_unpacked *target,
-                              bool in_fsck)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
-       if (ret)
-               goto err;
-
-       if (fsck_err_on(d.v->d_type != inode_d_type(target),
-                       trans, dirent_d_type_wrong,
-                       "incorrect d_type: got %s, should be %s:\n%s",
-                       bch2_d_type_str(d.v->d_type),
-                       bch2_d_type_str(inode_d_type(target)),
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
-               struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
-               ret = PTR_ERR_OR_ZERO(n);
-               if (ret)
-                       goto err;
-
-               bkey_reassemble(&n->k_i, d.s_c);
-               n->v.d_type = inode_d_type(target);
-               if (n->v.d_type == DT_SUBVOL) {
-                       n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
-                       n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
-               } else {
-                       n->v.d_inum = cpu_to_le64(target->bi_inum);
-               }
-
-               ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
-                                       BTREE_UPDATE_internal_snapshot_node);
-               if (ret)
-                       goto err;
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/*
- * BCH_INODE_has_case_insensitive:
- * We have to track whether directories have any descendant directory that is
- * casefolded - for overlayfs:
- */
-
-static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
-{
-       struct btree_iter iter = {};
-       int ret = 0;
-
-       while (true) {
-               struct bch_inode_unpacked inode;
-               ret = bch2_inode_peek(trans, &iter, &inode, inum,
-                                     BTREE_ITER_intent|BTREE_ITER_with_updates);
-               if (ret)
-                       break;
-
-               if (inode.bi_flags & BCH_INODE_has_case_insensitive)
-                       break;
-
-               inode.bi_flags |= BCH_INODE_has_case_insensitive;
-               ret = bch2_inode_write(trans, &iter, &inode);
-               if (ret)
-                       break;
-
-               bch2_trans_iter_exit(trans, &iter);
-               if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
-                       break;
-
-               inum = parent_inum(inum, &inode);
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
-                                             struct bch_inode_unpacked *inode)
-{
-       if (!bch2_inode_casefold(trans->c, inode))
-               return 0;
-
-       inode->bi_flags |= BCH_INODE_has_case_insensitive;
-
-       return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
-}
-
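The propagation walk above sets the flag on each ancestor and stops early once it finds one that already has it (or reaches the root), so repeated creates in a casefolded subtree stay cheap. A standalone sketch with illustrative types:

/* Standalone sketch of the upward flag propagation. */
#include <stdbool.h>
#include <stdio.h>

struct dir { struct dir *parent; bool has_ci; const char *name; };

static void propagate_has_ci(struct dir *d)
{
        for (; d && !d->has_ci; d = d->parent)
                d->has_ci = true;       /* an already-set ancestor ends the walk */
}

int main(void)
{
        struct dir root = { .name = "/" };
        struct dir a = { .parent = &root, .name = "a" };
        struct dir b = { .parent = &a, .name = "b" };

        propagate_has_ci(&b);
        printf("%s=%d %s=%d %s=%d\n", root.name, root.has_ci,
               a.name, a.has_ci, b.name, b.has_ci);     /* all 1 */
        return 0;
}
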
-int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
-                                         struct bch_inode_unpacked *inode,
-                                         snapshot_id_list *snapshot_overwrites,
-                                         bool *do_update)
-{
-       struct printbuf buf = PRINTBUF;
-       bool repairing_parents = false;
-       int ret = 0;
-
-       if (!S_ISDIR(inode->bi_mode)) {
-               /*
-                * Old versions set bi_casefold on non-directories, but that's
-                * unnecessary and wasteful
-                */
-               if (inode->bi_casefold) {
-                       inode->bi_casefold = 0;
-                       *do_update = true;
-               }
-               return 0;
-       }
-
-       if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
-               return 0;
-
-       if (bch2_inode_casefold(trans->c, inode) &&
-           !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
-               prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
-                          inode->bi_inum, inode->bi_snapshot);
-
-               ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
-                                                snapshot_overwrites, &buf);
-               if (ret)
-                       goto err;
-
-               if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
-                       inode->bi_flags |= BCH_INODE_has_case_insensitive;
-                       *do_update = true;
-               }
-       }
-
-       if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
-               goto out;
-
-       struct bch_inode_unpacked dir = *inode;
-       u32 snapshot = dir.bi_snapshot;
-
-       while (!(dir.bi_inum    == BCACHEFS_ROOT_INO &&
-                dir.bi_subvol  == BCACHEFS_ROOT_SUBVOL)) {
-               if (dir.bi_parent_subvol) {
-                       ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
-                       if (ret)
-                               goto err;
-
-                       snapshot_overwrites = NULL;
-               }
-
-               ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
-               if (ret)
-                       goto err;
-
-               if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
-                       prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
-
-                       ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
-                                                        snapshot_overwrites, &buf);
-                       if (ret)
-                               goto err;
-
-                       if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
-                               dir.bi_flags |= BCH_INODE_has_case_insensitive;
-                               ret = __bch2_fsck_write_inode(trans, &dir);
-                               if (ret)
-                                       goto err;
-                               repairing_parents = true;
-                       }
-               }
-
-               /*
-                * We only need to check the first parent, unless we find an
-                * inconsistency
-                */
-               if (!repairing_parents)
-                       break;
-       }
-out:
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       if (ret)
-               return ret;
-
-       if (repairing_parents) {
-               return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-                       -BCH_ERR_transaction_restart_nested;
-       }
-
-       return 0;
-}
diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h
deleted file mode 100644 (file)
index ae6ebc2..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NAMEI_H
-#define _BCACHEFS_NAMEI_H
-
-#include "dirent.h"
-
-struct posix_acl;
-
-#define BCH_CREATE_TMPFILE             (1U << 0)
-#define BCH_CREATE_SUBVOL              (1U << 1)
-#define BCH_CREATE_SNAPSHOT            (1U << 2)
-#define BCH_CREATE_SNAPSHOT_RO         (1U << 3)
-
-int bch2_create_trans(struct btree_trans *, subvol_inum,
-                     struct bch_inode_unpacked *,
-                     struct bch_inode_unpacked *,
-                     const struct qstr *,
-                     uid_t, gid_t, umode_t, dev_t,
-                     struct posix_acl *,
-                     struct posix_acl *,
-                     subvol_inum, unsigned);
-
-int bch2_link_trans(struct btree_trans *,
-                   subvol_inum, struct bch_inode_unpacked *,
-                   subvol_inum, struct bch_inode_unpacked *,
-                   const struct qstr *);
-
-int bch2_unlink_trans(struct btree_trans *, subvol_inum,
-                     struct bch_inode_unpacked *,
-                     struct bch_inode_unpacked *,
-                     const struct qstr *, bool);
-
-int bch2_rename_trans(struct btree_trans *,
-                     subvol_inum, struct bch_inode_unpacked *,
-                     subvol_inum, struct bch_inode_unpacked *,
-                     struct bch_inode_unpacked *,
-                     struct bch_inode_unpacked *,
-                     const struct qstr *,
-                     const struct qstr *,
-                     enum bch_rename_mode);
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
-                         struct bch_inode_unpacked *);
-
-int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
-                              snapshot_id_list *, struct printbuf *);
-
-int __bch2_check_dirent_target(struct btree_trans *,
-                              struct btree_iter *,
-                              struct bkey_s_c_dirent,
-                              struct bch_inode_unpacked *, bool);
-
-static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
-                                         struct bkey_s_c_dirent d)
-{
-       return  inode->bi_dir           == d.k->p.inode &&
-               inode->bi_dir_offset    == d.k->p.offset;
-}
-
-static inline int bch2_check_dirent_target(struct btree_trans *trans,
-                                          struct btree_iter *dirent_iter,
-                                          struct bkey_s_c_dirent d,
-                                          struct bch_inode_unpacked *target,
-                                          bool in_fsck)
-{
-       if (likely(inode_points_to_dirent(target, d) &&
-                  d.v->d_type == inode_d_type(target)))
-               return 0;
-
-       return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
-                                             struct bch_inode_unpacked *);
-int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
-                                         snapshot_id_list *, bool *);
-
-#endif /* _BCACHEFS_NAMEI_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
deleted file mode 100644 (file)
index 962218f..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "nocow_locking.h"
-#include "util.h"
-
-#include <linux/closure.h>
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
-{
-       u64 dev_bucket = bucket_to_u64(bucket);
-       struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(l->b); i++)
-               if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
-                       return true;
-       return false;
-}
-
-#define sign(v)                ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
-
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
-{
-       u64 dev_bucket = bucket_to_u64(bucket);
-       struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-       int lock_val = flags ? 1 : -1;
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(l->b); i++)
-               if (l->b[i] == dev_bucket) {
-                       int v = atomic_sub_return(lock_val, &l->l[i]);
-
-                       BUG_ON(v && sign(v) != lock_val);
-                       if (!v)
-                               closure_wake_up(&l->wait);
-                       return;
-               }
-
-       BUG();
-}
-
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
-                                u64 dev_bucket, int flags)
-{
-       int v, lock_val = flags ? 1 : -1;
-       unsigned i;
-
-       spin_lock(&l->lock);
-
-       for (i = 0; i < ARRAY_SIZE(l->b); i++)
-               if (l->b[i] == dev_bucket)
-                       goto got_entry;
-
-       for (i = 0; i < ARRAY_SIZE(l->b); i++)
-               if (!atomic_read(&l->l[i])) {
-                       l->b[i] = dev_bucket;
-                       goto take_lock;
-               }
-fail:
-       spin_unlock(&l->lock);
-       return false;
-got_entry:
-       v = atomic_read(&l->l[i]);
-       if (lock_val > 0 ? v < 0 : v > 0)
-               goto fail;
-take_lock:
-       v = atomic_read(&l->l[i]);
-       /* Overflow? */
-       if (v && sign(v + lock_val) != sign(v))
-               goto fail;
-
-       atomic_add(lock_val, &l->l[i]);
-       spin_unlock(&l->lock);
-       return true;
-}
-
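The lock state above is a signed counter per bucket slot: positive while held for update, negative while held for read, so one sign check detects a conflicting holder and another detects counter overflow. A standalone sketch using a plain int in place of the spinlock-protected atomics:

/* Standalone sketch of the signed-counter lock: >0 held for update,
 * <0 held for read, 0 free. */
#include <stdbool.h>
#include <stdio.h>

#define sign(v) ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)

static bool trylock(int *l, bool update)
{
        int lock_val = update ? 1 : -1;
        int v = *l;

        if (v && sign(v) != lock_val)           /* held in the other mode */
                return false;
        if (v && sign(v + lock_val) != sign(v)) /* counter overflow */
                return false;
        *l += lock_val;
        return true;
}

int main(void)
{
        int l = 0;

        printf("%d\n", trylock(&l, false));     /* 1: first reader */
        printf("%d\n", trylock(&l, false));     /* 1: readers share */
        printf("%d\n", trylock(&l, true));      /* 0: update conflicts */
        return 0;
}
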
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
-                             struct nocow_lock_bucket *l,
-                             u64 dev_bucket, int flags)
-{
-       if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
-               struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
-               u64 start_time = local_clock();
-
-               __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
-               bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
-       }
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
-
-{
-       unsigned i, nr_zero = 0;
-       struct nocow_lock_bucket *l;
-
-       for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
-               unsigned v = 0;
-
-               for (i = 0; i < ARRAY_SIZE(l->l); i++)
-                       v |= atomic_read(&l->l[i]);
-
-               if (!v) {
-                       nr_zero++;
-                       continue;
-               }
-
-               if (nr_zero)
-                       prt_printf(out, "(%u empty entries)\n", nr_zero);
-               nr_zero = 0;
-
-               for (i = 0; i < ARRAY_SIZE(l->l); i++) {
-                       int v = atomic_read(&l->l[i]);
-                       if (v) {
-                               bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
-                               prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
-                       }
-               }
-               prt_newline(out);
-       }
-
-       if (nr_zero)
-               prt_printf(out, "(%u empty entries)\n", nr_zero);
-}
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *c)
-{
-       struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
-       for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
-               for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
-                       BUG_ON(atomic_read(&l->l[j]));
-}
-
-void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
-{
-       struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
-       for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
-               spin_lock_init(&l->lock);
-}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
deleted file mode 100644 (file)
index 48b8a00..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_H
-#define _BCACHEFS_NOCOW_LOCKING_H
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "nocow_locking_types.h"
-
-#include <linux/hash.h>
-
-static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
-                                                         u64 dev_bucket)
-{
-       unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
-
-       return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
-}
-
-#define BUCKET_NOCOW_LOCK_UPDATE       (1 << 0)
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
-                             struct nocow_lock_bucket *, u64, int);
-
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
-                                         struct bpos bucket, int flags)
-{
-       u64 dev_bucket = bucket_to_u64(bucket);
-       struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
-       __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
-
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
-                                         struct bpos bucket, int flags)
-{
-       u64 dev_bucket = bucket_to_u64(bucket);
-       struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
-       return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *);
-void bch2_fs_nocow_locking_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
deleted file mode 100644 (file)
index bd12bf6..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
-#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
-
-#define BUCKET_NOCOW_LOCKS_BITS                10
-#define BUCKET_NOCOW_LOCKS             (1U << BUCKET_NOCOW_LOCKS_BITS)
-
-struct nocow_lock_bucket {
-       struct closure_waitlist         wait;
-       spinlock_t                      lock;
-       u64                             b[4];
-       atomic_t                        l[4];
-} __aligned(SMP_CACHE_BYTES);
-
-struct bucket_nocow_lock_table {
-       struct nocow_lock_bucket        l[BUCKET_NOCOW_LOCKS];
-};
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
-
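With these constants the table is statically sized: 1024 cacheline-aligned buckets of four slots each. A quick standalone estimate of the footprint, assuming 64-byte cachelines (the real struct also carries a waitlist and spinlock per bucket, which the alignment padding absorbs here):

/* Standalone sketch: rough size of bucket_nocow_lock_table. */
#include <stdio.h>

#define BUCKET_NOCOW_LOCKS_BITS 10
#define BUCKET_NOCOW_LOCKS      (1U << BUCKET_NOCOW_LOCKS_BITS)

int main(void)
{
        unsigned long long b = 4 * 8;   /* u64 b[4] */
        unsigned long long l = 4 * 4;   /* atomic_t l[4] */
        unsigned long long per_bucket = ((b + l + 63) / 64) * 64; /* aligned */

        printf("%u buckets * %llu bytes = %llu KiB (plus waitlist/lock)\n",
               BUCKET_NOCOW_LOCKS, per_bucket,
               BUCKET_NOCOW_LOCKS * per_bucket / 1024);
        return 0;
}
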
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
deleted file mode 100644 (file)
index b1cf889..0000000
+++ /dev/null
@@ -1,844 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kernel.h>
-#include <linux/fs_parser.h>
-
-#include "bcachefs.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "movinggc.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "super-io.h"
-#include "util.h"
-
-#define x(t, n, ...) [n] = #t,
-
-const char * const bch2_error_actions[] = {
-       BCH_ERROR_ACTIONS()
-       NULL
-};
-
-const char * const bch2_degraded_actions[] = {
-       BCH_DEGRADED_ACTIONS()
-       NULL
-};
-
-const char * const bch2_fsck_fix_opts[] = {
-       BCH_FIX_ERRORS_OPTS()
-       NULL
-};
-
-const char * const bch2_version_upgrade_opts[] = {
-       BCH_VERSION_UPGRADE_OPTS()
-       NULL
-};
-
-const char * const bch2_sb_features[] = {
-       BCH_SB_FEATURES()
-       NULL
-};
-
-const char * const bch2_sb_compat[] = {
-       BCH_SB_COMPAT()
-       NULL
-};
-
-const char * const __bch2_btree_ids[] = {
-       BCH_BTREE_IDS()
-       NULL
-};
-
-const char * const __bch2_csum_types[] = {
-       BCH_CSUM_TYPES()
-       NULL
-};
-
-const char * const __bch2_csum_opts[] = {
-       BCH_CSUM_OPTS()
-       NULL
-};
-
-const char * const __bch2_compression_types[] = {
-       BCH_COMPRESSION_TYPES()
-       NULL
-};
-
-const char * const bch2_compression_opts[] = {
-       BCH_COMPRESSION_OPTS()
-       NULL
-};
-
-const char * const __bch2_str_hash_types[] = {
-       BCH_STR_HASH_TYPES()
-       NULL
-};
-
-const char * const bch2_str_hash_opts[] = {
-       BCH_STR_HASH_OPTS()
-       NULL
-};
-
-const char * const __bch2_data_types[] = {
-       BCH_DATA_TYPES()
-       NULL
-};
-
-const char * const bch2_member_states[] = {
-       BCH_MEMBER_STATES()
-       NULL
-};
-
-static const char * const __bch2_jset_entry_types[] = {
-       BCH_JSET_ENTRY_TYPES()
-       NULL
-};
-
-static const char * const __bch2_fs_usage_types[] = {
-       BCH_FS_USAGE_TYPES()
-       NULL
-};
-
-#undef x
-
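The string tables above are generated with the x-macro pattern: each BCH_*() list expands x() once per entry, and defining x(t, n) as '[n] = #t,' turns the list into a designated-initializer string array. A standalone sketch with a made-up ACTIONS() list standing in for BCH_ERROR_ACTIONS() and friends (the kernel's x() is variadic because some lists carry extra columns):

/* Standalone sketch of the x-macro string-table pattern. */
#include <stdio.h>

#define ACTIONS()       \
        x(continue, 0)  \
        x(ro, 1)        \
        x(panic, 2)

#define x(t, n) [n] = #t,
static const char * const action_strs[] = {
        ACTIONS()
        NULL
};
#undef x

int main(void)
{
        for (unsigned i = 0; action_strs[i]; i++)
                printf("%u = %s\n", i, action_strs[i]);
        return 0;
}
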
-static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
-                                   unsigned nr, const char *type, unsigned idx)
-{
-       if (idx < nr)
-               prt_str(out, opts[idx]);
-       else
-               prt_printf(out, "(unknown %s %u)", type, idx);
-}
-
-#define PRT_STR_OPT_BOUNDSCHECKED(name, type)                                  \
-void bch2_prt_##name(struct printbuf *out, type t)                             \
-{                                                                              \
-       prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
-}
-
-PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type,     enum bch_jset_entry_type);
-PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type,       enum bch_fs_usage_type);
-PRT_STR_OPT_BOUNDSCHECKED(data_type,           enum bch_data_type);
-PRT_STR_OPT_BOUNDSCHECKED(csum_opt,            enum bch_csum_opt);
-PRT_STR_OPT_BOUNDSCHECKED(csum_type,           enum bch_csum_type);
-PRT_STR_OPT_BOUNDSCHECKED(compression_type,    enum bch_compression_type);
-PRT_STR_OPT_BOUNDSCHECKED(str_hash_type,       enum bch_str_hash_type);
-
-static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
-                                    struct printbuf *err)
-{
-       if (!val) {
-               *res = FSCK_FIX_yes;
-       } else {
-               int ret = match_string(bch2_fsck_fix_opts, -1, val);
-
-               if (ret < 0 && err)
-                       prt_str(err, "fix_errors: invalid selection");
-               if (ret < 0)
-                       return ret;
-               *res = ret;
-       }
-
-       return 0;
-}
-
-static void bch2_opt_fix_errors_to_text(struct printbuf *out,
-                                       struct bch_fs *c,
-                                       struct bch_sb *sb,
-                                       u64 v)
-{
-       prt_str(out, bch2_fsck_fix_opts[v]);
-}
-
-#define bch2_opt_fix_errors (struct bch_opt_fn) {      \
-       .parse = bch2_opt_fix_errors_parse,             \
-       .to_text = bch2_opt_fix_errors_to_text,         \
-}
-
-const char * const bch2_d_types[BCH_DT_MAX] = {
-       [DT_UNKNOWN]    = "unknown",
-       [DT_FIFO]       = "fifo",
-       [DT_CHR]        = "chr",
-       [DT_DIR]        = "dir",
-       [DT_BLK]        = "blk",
-       [DT_REG]        = "reg",
-       [DT_LNK]        = "lnk",
-       [DT_SOCK]       = "sock",
-       [DT_WHT]        = "whiteout",
-       [DT_SUBVOL]     = "subvol",
-};
-
-void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
-{
-#define x(_name, ...)                                          \
-       if (opt_defined(src, _name))                                    \
-               opt_set(*dst, _name, src._name);
-
-       BCH_OPTS()
-#undef x
-}
-
-bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
-       switch (id) {
-#define x(_name, ...)                                          \
-       case Opt_##_name:                                               \
-               return opt_defined(*opts, _name);
-       BCH_OPTS()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
-       switch (id) {
-#define x(_name, ...)                                          \
-       case Opt_##_name:                                               \
-               return opts->_name;
-       BCH_OPTS()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
-{
-       switch (id) {
-#define x(_name, ...)                                          \
-       case Opt_##_name:                                               \
-               opt_set(*opts, _name, v);                               \
-               break;
-       BCH_OPTS()
-#undef x
-       default:
-               BUG();
-       }
-}
-
-/* dummy option, for options that aren't stored in the superblock */
-typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
-typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
-typedef u64 (*member_opt_get_fn)(const struct bch_member *);
-typedef void (*member_opt_set_fn)(struct bch_member *, u64);
-
-__maybe_unused static const sb_opt_get_fn      BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const sb_opt_set_fn      SET_BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const member_opt_get_fn  BCH2_NO_MEMBER_OPT = NULL;
-__maybe_unused static const member_opt_set_fn  SET_BCH2_NO_MEMBER_OPT = NULL;
-
-#define type_compatible_or_null(_p, _type)                             \
-       __builtin_choose_expr(                                          \
-               __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
-
-const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL()             .type = BCH_OPT_BOOL, .min = 0, .max = 2
-#define OPT_UINT(_min, _max)   .type = BCH_OPT_UINT,                   \
-                               .min = _min, .max = _max
-#define OPT_STR(_choices)      .type = BCH_OPT_STR,                    \
-                               .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
-                               .choices = _choices
-#define OPT_STR_NOLIMIT(_choices)      .type = BCH_OPT_STR,            \
-                               .min = 0, .max = U64_MAX,               \
-                               .choices = _choices
-#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD,               \
-                               .choices = _choices
-#define OPT_FN(_fn)            .type = BCH_OPT_FN, .fn = _fn
-
-#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help)        \
-       [Opt_##_name] = {                                               \
-               .attr.name      = #_name,                               \
-               .attr.mode      = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
-               .flags          = _flags,                               \
-               .hint           = _hint,                                \
-               .help           = _help,                                \
-               .get_sb         = type_compatible_or_null(_sb_opt,      *BCH2_NO_SB_OPT),       \
-               .set_sb         = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT),   \
-               .get_member     = type_compatible_or_null(_sb_opt,      *BCH2_NO_MEMBER_OPT),   \
-               .set_member     = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
-               _type                                                   \
-       },
-
-       BCH_OPTS()
-#undef x
-};
-
-int bch2_opt_lookup(const char *name)
-{
-       const struct bch_option *i;
-
-       for (i = bch2_opt_table;
-            i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
-            i++)
-               if (!strcmp(name, i->attr.name))
-                       return i - bch2_opt_table;
-
-       return -1;
-}
-
-struct opt_synonym {
-       const char      *s1, *s2;
-};
-
-static const struct opt_synonym bch2_opt_synonyms[] = {
-       { "quota",      "usrquota" },
-};
-
-static int bch2_mount_opt_lookup(const char *name)
-{
-       const struct opt_synonym *i;
-
-       for (i = bch2_opt_synonyms;
-            i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
-            i++)
-               if (!strcmp(name, i->s1))
-                       name = i->s2;
-
-       return bch2_opt_lookup(name);
-}
-
-struct opt_val_synonym {
-       const char      *opt, *v1, *v2;
-};
-
-static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
-       { "degraded",   "true",         "yes" },
-       { "degraded",   "false",        "no"  },
-       { "degraded",   "1",            "yes" },
-       { "degraded",   "0",            "no"  },
-};
-
-static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
-{
-       const struct opt_val_synonym *i;
-
-       for (i = bch2_opt_val_synonyms;
-            i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
-            i++)
-               if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
-                       return i->v2;
-
-       return val;
-}
-
-int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
-{
-       if (v < opt->min) {
-               if (err)
-                       prt_printf(err, "%s: too small (min %llu)",
-                              opt->attr.name, opt->min);
-               return -BCH_ERR_ERANGE_option_too_small;
-       }
-
-       if (opt->max && v >= opt->max) {
-               if (err)
-                       prt_printf(err, "%s: too big (max %llu)",
-                              opt->attr.name, opt->max);
-               return -BCH_ERR_ERANGE_option_too_big;
-       }
-
-       if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
-               if (err)
-                       prt_printf(err, "%s: not a multiple of 512",
-                              opt->attr.name);
-               return -BCH_ERR_opt_parse_error;
-       }
-
-       if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
-               if (err)
-                       prt_printf(err, "%s: must be a power of two",
-                              opt->attr.name);
-               return -BCH_ERR_opt_parse_error;
-       }
-
-       if (opt->fn.validate)
-               return opt->fn.validate(v, err);
-
-       return 0;
-}
-
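Beyond the min/max range, bch2_opt_validate() uses two bit tests: OPT_SB_FIELD_SECTORS values must be a multiple of 512 (low nine bits clear), and OPT_MUST_BE_POW_2 values must have exactly one bit set. A standalone sketch of both tests:

/* Standalone sketch of the two validation bit tests. */
#include <stdbool.h>
#include <stdio.h>

static bool is_sector_multiple(unsigned long long v)
{
        return !(v & 511);              /* multiple of 512: low 9 bits clear */
}

static bool is_pow2(unsigned long long v)
{
        return v && !(v & (v - 1));     /* exactly one bit set */
}

int main(void)
{
        printf("%d %d\n", is_sector_multiple(4096), is_sector_multiple(1000)); /* 1 0 */
        printf("%d %d\n", is_pow2(4096), is_pow2(4095));                       /* 1 0 */
        return 0;
}
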
-int bch2_opt_parse(struct bch_fs *c,
-                  const struct bch_option *opt,
-                  const char *val, u64 *res,
-                  struct printbuf *err)
-{
-       ssize_t ret;
-
-       if (err)
-               printbuf_indent_add_nextline(err, 2);
-
-       switch (opt->type) {
-       case BCH_OPT_BOOL:
-               if (!val)
-                       val = "1";
-
-               ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
-               if (ret != -BCH_ERR_option_not_bool) {
-                       *res = ret;
-               } else {
-                       if (err)
-                               prt_printf(err, "%s: must be bool", opt->attr.name);
-                       return ret;
-               }
-               break;
-       case BCH_OPT_UINT:
-               if (!val) {
-                       if (err)
-                               prt_printf(err, "%s: required value",
-                                          opt->attr.name);
-                       return -EINVAL;
-               }
-
-               if (*val != '-') {
-                       ret = opt->flags & OPT_HUMAN_READABLE
-                           ? bch2_strtou64_h(val, res)
-                           : kstrtou64(val, 10, res);
-               } else {
-                       if (err)
-                               prt_printf(err, "%s: must be a non-negative number",
-                                          opt->attr.name);
-                       return -BCH_ERR_option_negative;
-               }
-
-               if (ret < 0) {
-                       if (err)
-                               prt_printf(err, "%s: must be a number",
-                                          opt->attr.name);
-                       return ret;
-               }
-               break;
-       case BCH_OPT_STR:
-               if (!val) {
-                       if (err)
-                               prt_printf(err, "%s: required value",
-                                          opt->attr.name);
-                       return -EINVAL;
-               }
-
-               ret = match_string(opt->choices, -1, val);
-               if (ret < 0) {
-                       if (err)
-                               prt_printf(err, "%s: invalid selection",
-                                          opt->attr.name);
-                       return ret;
-               }
-
-               *res = ret;
-               break;
-       case BCH_OPT_BITFIELD: {
-               s64 v = bch2_read_flag_list(val, opt->choices);
-               if (v < 0)
-                       return v;
-               *res = v;
-               break;
-       }
-       case BCH_OPT_FN:
-               ret = opt->fn.parse(c, val, res, err);
-
-               if (ret == -BCH_ERR_option_needs_open_fs)
-                       return ret;
-
-               if (ret < 0) {
-                       if (err)
-                               prt_printf(err, "%s: parse error",
-                                          opt->attr.name);
-                       return ret;
-               }
-       }
-
-       return bch2_opt_validate(opt, *res, err);
-}
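
Usage sketch, assuming a filesystem handle c; gc_reserve_bytes carries OPT_HUMAN_READABLE (see opts.h below), so size suffixes are accepted:

    u64 v;
    struct printbuf err = PRINTBUF;
    int ret = bch2_opt_parse(c, &bch2_opt_table[Opt_gc_reserve_bytes], "1G", &v, &err);
    /* on success v == 1ULL << 30, already range-checked by bch2_opt_validate() */
    printbuf_exit(&err);
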
-
-void bch2_opt_to_text(struct printbuf *out,
-                     struct bch_fs *c, struct bch_sb *sb,
-                     const struct bch_option *opt, u64 v,
-                     unsigned flags)
-{
-       if (flags & OPT_SHOW_MOUNT_STYLE) {
-               if (opt->type == BCH_OPT_BOOL) {
-                       prt_printf(out, "%s%s",
-                              v ? "" : "no",
-                              opt->attr.name);
-                       return;
-               }
-
-               prt_printf(out, "%s=", opt->attr.name);
-       }
-
-       switch (opt->type) {
-       case BCH_OPT_BOOL:
-       case BCH_OPT_UINT:
-               if (opt->flags & OPT_HUMAN_READABLE)
-                       prt_human_readable_u64(out, v);
-               else
-                       prt_printf(out, "%lli", v);
-               break;
-       case BCH_OPT_STR:
-               if (v < opt->min || v >= opt->max)
-                       prt_printf(out, "(invalid option %lli)", v);
-               else if (flags & OPT_SHOW_FULL_LIST)
-                       prt_string_option(out, opt->choices, v);
-               else
-                       prt_str(out, opt->choices[v]);
-               break;
-       case BCH_OPT_BITFIELD:
-               prt_bitflags(out, opt->choices, v);
-               break;
-       case BCH_OPT_FN:
-               opt->fn.to_text(out, c, sb, v);
-               break;
-       default:
-               BUG();
-       }
-}
-
-void bch2_opts_to_text(struct printbuf *out,
-                      struct bch_opts opts,
-                      struct bch_fs *c, struct bch_sb *sb,
-                      unsigned show_mask, unsigned hide_mask,
-                      unsigned flags)
-{
-       bool first = true;
-
-       for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
-               const struct bch_option *opt = &bch2_opt_table[i];
-
-               if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
-                       continue;
-
-               u64 v = bch2_opt_get_by_id(&opts, i);
-               if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
-                       continue;
-
-               if (!first)
-                       prt_char(out, ',');
-               first = false;
-
-               bch2_opt_to_text(out, c, sb, opt, v, flags);
-       }
-}
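
The OPT_SHOW_MOUNT_STYLE special case in bch2_opt_to_text() renders booleans as a bare name or its "no" negation; a sketch:

    struct printbuf out = PRINTBUF;
    bch2_opt_to_text(&out, c, sb, &bch2_opt_table[Opt_discard], 0, OPT_SHOW_MOUNT_STYLE);
    /* out.buf == "nodiscard"; non-boolean types render as "name=value" instead */
    printbuf_exit(&out);
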
-
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
-{
-       int ret = 0;
-
-       switch (id) {
-       case Opt_state:
-               if (ca)
-                       return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
-               break;
-
-       case Opt_compression:
-       case Opt_background_compression:
-               ret = bch2_check_set_has_compressed_data(c, v);
-               break;
-       case Opt_erasure_code:
-               if (v)
-                       bch2_check_set_feature(c, BCH_FEATURE_ec);
-               break;
-       default:
-               break;
-       }
-
-       return ret;
-}
-
-int bch2_opts_hooks_pre_set(struct bch_fs *c)
-{
-       for (unsigned i = 0; i < bch2_opts_nr; i++) {
-               int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
-                           struct bch_opts *new_opts, enum bch_opt_id id)
-{
-       switch (id) {
-       case Opt_foreground_target:
-               if (new_opts->foreground_target &&
-                   !new_opts->background_target)
-                       bch2_set_rebalance_needs_scan(c, inum);
-               break;
-       case Opt_compression:
-               if (new_opts->compression &&
-                   !new_opts->background_compression)
-                       bch2_set_rebalance_needs_scan(c, inum);
-               break;
-       case Opt_background_target:
-               if (new_opts->background_target)
-                       bch2_set_rebalance_needs_scan(c, inum);
-               break;
-       case Opt_background_compression:
-               if (new_opts->background_compression)
-                       bch2_set_rebalance_needs_scan(c, inum);
-               break;
-       case Opt_rebalance_enabled:
-               bch2_rebalance_wakeup(c);
-               break;
-       case Opt_copygc_enabled:
-               bch2_copygc_wakeup(c);
-               break;
-       case Opt_discard:
-               if (!ca) {
-                       mutex_lock(&c->sb_lock);
-                       for_each_member_device(c, ca) {
-                               struct bch_member *m =
-                                       bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
-                               SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
-                       }
-
-                       bch2_write_super(c);
-                       mutex_unlock(&c->sb_lock);
-               }
-               break;
-       case Opt_version_upgrade:
-               /*
-                * XXX: in the future we'll likely want to do compatible
-                * upgrades at runtime as well, but right now there's nothing
-                * that does that:
-                */
-               if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
-                       bch2_sb_upgrade_incompat(c);
-               break;
-       default:
-               break;
-       }
-}
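
For example, changing background_target at runtime marks the filesystem for a rebalance scan. A sketch; `target` stands in for a previously parsed u16 target value, and passing inum 0 to address the whole filesystem is an assumption:

    struct bch_opts new_opts = c->opts;
    opt_set(new_opts, background_target, target);
    bch2_opt_hook_post_set(c, NULL, 0, &new_opts, Opt_background_target);
    /* bch2_set_rebalance_needs_scan() ran; the rebalance thread will
     * revisit existing data against the new target */
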
-
-int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
-                            struct printbuf *parse_later,
-                            const char *name, const char *val)
-{
-       struct printbuf err = PRINTBUF;
-       u64 v;
-       int ret, id;
-
-       id = bch2_mount_opt_lookup(name);
-
-       /* Check for the form "noopt", negation of a boolean opt: */
-       if (id < 0 &&
-           !val &&
-           !strncmp("no", name, 2)) {
-               id = bch2_mount_opt_lookup(name + 2);
-               val = "0";
-       }
-
-       /* Unknown options are ignored: */
-       if (id < 0)
-               return 0;
-
-       /* must have a value for synonym lookup - but OPT_FN is weird */
-       if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
-               val = "1";
-
-       val = bch2_opt_val_synonym_lookup(name, val);
-
-       if (!(bch2_opt_table[id].flags & OPT_MOUNT))
-               goto bad_opt;
-
-       if (id == Opt_acl &&
-           !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
-               goto bad_opt;
-
-       if ((id == Opt_usrquota ||
-            id == Opt_grpquota) &&
-           !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
-               goto bad_opt;
-
-       ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
-       if (ret == -BCH_ERR_option_needs_open_fs) {
-               ret = 0;
-
-               if (parse_later) {
-                       prt_printf(parse_later, "%s=%s,", name, val);
-                       if (parse_later->allocation_failure)
-                               ret = -ENOMEM;
-               }
-
-               goto out;
-       }
-
-       if (ret < 0)
-               goto bad_val;
-
-       if (opts)
-               bch2_opt_set_by_id(opts, id, v);
-
-       ret = 0;
-out:
-       printbuf_exit(&err);
-       return ret;
-bad_opt:
-       ret = -BCH_ERR_option_name;
-       goto out;
-bad_val:
-       ret = -BCH_ERR_option_value;
-       goto out;
-}
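
A token like "nofsck" therefore behaves as "fsck=0"; sketch:

    int ret = bch2_parse_one_mount_opt(c, &opts, NULL, "nofsck", NULL);
    /* the first lookup fails, "no" is stripped, val defaults to "0",
     * so opts.fsck is set to false */
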
-
-int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
-                         struct printbuf *parse_later, char *options,
-                         bool ignore_unknown)
-{
-       char *copied_opts, *copied_opts_start;
-       char *opt, *name, *val;
-       int ret = 0;
-
-       if (!options)
-               return 0;
-
-       /*
-        * sys_fsconfig() is now occasionally providing us with option lists
-        * starting with a comma - weird.
-        */
-       if (*options == ',')
-               options++;
-
-       copied_opts = kstrdup(options, GFP_KERNEL);
-       if (!copied_opts)
-               return -ENOMEM;
-       copied_opts_start = copied_opts;
-
-       while ((opt = strsep(&copied_opts, ",")) != NULL) {
-               if (!*opt)
-                       continue;
-
-               name    = strsep(&opt, "=");
-               val     = opt;
-
-               ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
-               if (ret == -BCH_ERR_option_name && ignore_unknown)
-                       ret = 0;
-               if (ret) {
-                       pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
-                       break;
-               }
-       }
-
-       kfree(copied_opts_start);
-       return ret;
-}
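
Usage sketch; the function duplicates the string internally before strsep() carves it up, and c may be NULL for options whose parsing needs no open filesystem (true for the bools and plain integers here):

    char opts_str[] = "fsck,degraded=yes,metadata_replicas=2";
    struct bch_opts opts = bch2_opts_empty();

    int ret = bch2_parse_mount_opts(NULL, &opts, NULL, opts_str, false);
    /* on success: opt_defined(opts, fsck) is true, opts.metadata_replicas == 2,
     * and opts.degraded holds the index of "yes" in bch2_degraded_actions */
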
-
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
-{
-       const struct bch_option *opt = bch2_opt_table + id;
-       u64 v;
-
-       if (dev_idx < 0) {
-               v = opt->get_sb(sb);
-       } else {
-               if (WARN(!bch2_member_exists(sb, dev_idx),
-                        "tried to read device option %s on nonexistent device %i",
-                        opt->attr.name, dev_idx))
-                       return 0;
-
-               struct bch_member m = bch2_sb_member_get(sb, dev_idx);
-               v = opt->get_member(&m);
-       }
-
-       if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
-               --v;
-
-       if (opt->flags & OPT_SB_FIELD_ILOG2)
-               v = 1ULL << v;
-
-       if (opt->flags & OPT_SB_FIELD_SECTORS)
-               v <<= 9;
-
-       return v;
-}
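
Worked example of the decode transforms, in the order applied above: encoded_extent_max is flagged OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2, so an on-disk field of 7 becomes

    v = 7;          /* raw superblock field: ilog2 of a sector count */
    v = 1ULL << v;  /* OPT_SB_FIELD_ILOG2: 128 sectors */
    v <<= 9;        /* OPT_SB_FIELD_SECTORS: 65536 bytes, the 64k default */
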
-
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
-{
-       for (unsigned id = 0; id < bch2_opts_nr; id++) {
-               const struct bch_option *opt = bch2_opt_table + id;
-
-               if (opt->get_sb)
-                       bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
-       }
-
-       return 0;
-}
-
-bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
-                      const struct bch_option *opt, u64 v)
-{
-       bool changed = false;
-
-       if (opt->flags & OPT_SB_FIELD_SECTORS)
-               v >>= 9;
-
-       if (opt->flags & OPT_SB_FIELD_ILOG2)
-               v = ilog2(v);
-
-       if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
-               v++;
-
-       if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
-               changed = v != opt->get_sb(sb);
-
-               opt->set_sb(sb, v);
-       }
-
-       if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
-               if (WARN(!bch2_member_exists(sb, dev_idx),
-                        "tried to set device option %s on nonexistent device %i",
-                        opt->attr.name, dev_idx))
-                       return false;
-
-               struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
-               changed = v != opt->get_member(m);
-               opt->set_member(m, v);
-       }
-
-       return changed;
-}
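
The transforms here mirror bch2_opt_from_sb() in reverse order, so the round trip is exact for values meeting the power-of-two and sector-multiple constraints:

    v = 65536;      /* bytes supplied by the user */
    v >>= 9;        /* OPT_SB_FIELD_SECTORS: 128 sectors */
    v = ilog2(v);   /* OPT_SB_FIELD_ILOG2: on-disk field value 7 */
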
-
-bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
-                    const struct bch_option *opt, u64 v)
-{
-       mutex_lock(&c->sb_lock);
-       bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
-       if (changed)
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-       return changed;
-}
-
-/* io opts: */
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
-{
-       struct bch_io_opts opts = {
-#define x(_name, _bits)        ._name = src._name,
-       BCH_INODE_OPTS()
-#undef x
-       };
-
-       bch2_io_opts_fixups(&opts);
-       return opts;
-}
-
-bool bch2_opt_is_inode_opt(enum bch_opt_id id)
-{
-       static const enum bch_opt_id inode_opt_list[] = {
-#define x(_name, _bits)        Opt_##_name,
-       BCH_INODE_OPTS()
-#undef x
-       };
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
-               if (inode_opt_list[i] == id)
-                       return true;
-
-       return false;
-}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
deleted file mode 100644 (file)
index 63f8e25..0000000
+++ /dev/null
@@ -1,693 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_OPTS_H
-#define _BCACHEFS_OPTS_H
-
-#include <linux/bug.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-#include <linux/sysfs.h>
-#include "bcachefs_format.h"
-
-struct bch_fs;
-
-extern const char * const bch2_error_actions[];
-extern const char * const bch2_degraded_actions[];
-extern const char * const bch2_fsck_fix_opts[];
-extern const char * const bch2_version_upgrade_opts[];
-extern const char * const bch2_sb_features[];
-extern const char * const bch2_sb_compat[];
-extern const char * const __bch2_btree_ids[];
-extern const char * const __bch2_csum_types[];
-extern const char * const __bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
-extern const char * const bch2_compression_opts[];
-extern const char * const __bch2_str_hash_types[];
-extern const char * const bch2_str_hash_opts[];
-extern const char * const __bch2_data_types[];
-extern const char * const bch2_member_states[];
-extern const char * const bch2_d_types[];
-
-void bch2_prt_jset_entry_type(struct printbuf *,       enum bch_jset_entry_type);
-void bch2_prt_fs_usage_type(struct printbuf *,         enum bch_fs_usage_type);
-void bch2_prt_data_type(struct printbuf *,             enum bch_data_type);
-void bch2_prt_csum_opt(struct printbuf *,              enum bch_csum_opt);
-void bch2_prt_csum_type(struct printbuf *,             enum bch_csum_type);
-void bch2_prt_compression_type(struct printbuf *,      enum bch_compression_type);
-void bch2_prt_str_hash_type(struct printbuf *,         enum bch_str_hash_type);
-
-static inline const char *bch2_d_type_str(unsigned d_type)
-{
-       return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
-}
-
-/*
- * Mount options; we also store defaults in the superblock.
- *
- * Also exposed via sysfs: if an option is writeable, and it's also stored in
- * the superblock, changing it via sysfs (currently? might change this) also
- * updates the superblock.
- *
- * We store options as signed integers, where -1 means undefined. This means we
- * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
- * apply the options from that struct that are defined.
- */
-
-/* When can be set: */
-enum opt_flags {
-       OPT_FS                  = BIT(0),       /* Filesystem option */
-       OPT_DEVICE              = BIT(1),       /* Device option */
-       OPT_INODE               = BIT(2),       /* Inode option */
-       OPT_FORMAT              = BIT(3),       /* May be specified at format time */
-       OPT_MOUNT               = BIT(4),       /* May be specified at mount time */
-       OPT_RUNTIME             = BIT(5),       /* May be specified at runtime */
-       OPT_HUMAN_READABLE      = BIT(6),
-       OPT_MUST_BE_POW_2       = BIT(7),       /* Must be power of 2 */
-       OPT_SB_FIELD_SECTORS    = BIT(8),       /* Superblock field is >> 9 of actual value */
-       OPT_SB_FIELD_ILOG2      = BIT(9),       /* Superblock field is ilog2 of actual value */
-       OPT_SB_FIELD_ONE_BIAS   = BIT(10),      /* 0 means default value */
-       OPT_HIDDEN              = BIT(11),
-};
-
-enum opt_type {
-       BCH_OPT_BOOL,
-       BCH_OPT_UINT,
-       BCH_OPT_STR,
-       BCH_OPT_BITFIELD,
-       BCH_OPT_FN,
-};
-
-struct bch_opt_fn {
-       int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
-       void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-       int (*validate)(u64, struct printbuf *);
-};
-
-/**
- * x(name, shortopt, type, in mem type, mode, sb_opt)
- *
- * @name       - name of mount option, sysfs attribute, and struct bch_opts
- *               member
- *
- * @mode       - when opt may be set
- *
- * @sb_option  - name of corresponding superblock option
- *
- * @type       - one of OPT_BOOL, OPT_UINT, OPT_STR
- */
-
-/*
- * XXX: add fields for
- *  - default value
- *  - helptext
- */
-
-#ifdef __KERNEL__
-#define RATELIMIT_ERRORS_DEFAULT true
-#else
-#define RATELIMIT_ERRORS_DEFAULT false
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCACHEFS_VERBOSE_DEFAULT       true
-#else
-#define BCACHEFS_VERBOSE_DEFAULT       false
-#endif
-
-#define BCH_FIX_ERRORS_OPTS()          \
-       x(exit, 0)                      \
-       x(yes,  1)                      \
-       x(no,   2)                      \
-       x(ask,  3)
-
-enum fsck_err_opts {
-#define x(t, n)        FSCK_FIX_##t,
-       BCH_FIX_ERRORS_OPTS()
-#undef x
-};
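
For reference, this expands to:

    enum fsck_err_opts {
        FSCK_FIX_exit,  /* 0 */
        FSCK_FIX_yes,   /* 1 */
        FSCK_FIX_no,    /* 2 */
        FSCK_FIX_ask,   /* 3 */
    };
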
-
-#define BCH_OPTS()                                                     \
-       x(block_size,                   u16,                            \
-         OPT_FS|OPT_FORMAT|                                            \
-         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,    \
-         OPT_UINT(512, 1U << 16),                                      \
-         BCH_SB_BLOCK_SIZE,            4 << 10,                        \
-         "size",       NULL)                                           \
-       x(btree_node_size,              u32,                            \
-         OPT_FS|OPT_FORMAT|                                            \
-         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,    \
-         OPT_UINT(512, 1U << 20),                                      \
-         BCH_SB_BTREE_NODE_SIZE,       256 << 10,                      \
-         "size",       "Btree node size, default 256k")                \
-       x(errors,                       u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_STR(bch2_error_actions),                                  \
-         BCH_SB_ERROR_ACTION,          BCH_ON_ERROR_fix_safe,          \
-         NULL,         "Action to take on filesystem error")           \
-       x(write_error_timeout,          u16,                            \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_UINT(1, 300),                                             \
-         BCH_SB_WRITE_ERROR_TIMEOUT,   30,                             \
-         NULL,         "Number of consecutive write errors allowed before kicking out a device")\
-       x(metadata_replicas,            u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_UINT(1, BCH_REPLICAS_MAX),                                \
-         BCH_SB_META_REPLICAS_WANT,    1,                              \
-         "#",          "Number of metadata replicas")                  \
-       x(data_replicas,                u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_UINT(1, BCH_REPLICAS_MAX),                                \
-         BCH_SB_DATA_REPLICAS_WANT,    1,                              \
-         "#",          "Number of data replicas")                      \
-       x(metadata_replicas_required, u8,                               \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_UINT(1, BCH_REPLICAS_MAX),                                \
-         BCH_SB_META_REPLICAS_REQ,     1,                              \
-         "#",          NULL)                                           \
-       x(data_replicas_required,       u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_UINT(1, BCH_REPLICAS_MAX),                                \
-         BCH_SB_DATA_REPLICAS_REQ,     1,                              \
-         "#",          NULL)                                           \
-       x(encoded_extent_max,           u32,                            \
-         OPT_FS|OPT_FORMAT|                                            \
-         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
-         OPT_UINT(4096, 2U << 20),                                     \
-         BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,                     \
-         "size",       "Maximum size of checksummed/compressed extents")\
-       x(metadata_checksum,            u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_STR(__bch2_csum_opts),                                    \
-         BCH_SB_META_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
-         NULL,         NULL)                                           \
-       x(data_checksum,                u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_STR(__bch2_csum_opts),                                    \
-         BCH_SB_DATA_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
-         NULL,         NULL)                                           \
-       x(checksum_err_retry_nr,        u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_UINT(0, 32),                                              \
-         BCH_SB_CSUM_ERR_RETRY_NR,     3,                              \
-         NULL,         NULL)                                           \
-       x(compression,                  u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_FN(bch2_opt_compression),                                 \
-         BCH_SB_COMPRESSION_TYPE,      BCH_COMPRESSION_OPT_none,       \
-         NULL,         NULL)                                           \
-       x(background_compression,       u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_FN(bch2_opt_compression),                                 \
-         BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,  \
-         NULL,         NULL)                                           \
-       x(str_hash,                     u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_STR(bch2_str_hash_opts),                                  \
-         BCH_SB_STR_HASH_TYPE,         BCH_STR_HASH_OPT_siphash,       \
-         NULL,         "Hash function for directory entries and xattrs")\
-       x(metadata_target,              u16,                            \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_FN(bch2_opt_target),                                      \
-         BCH_SB_METADATA_TARGET,       0,                              \
-         "(target)",   "Device or label for metadata writes")          \
-       x(foreground_target,            u16,                            \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_FN(bch2_opt_target),                                      \
-         BCH_SB_FOREGROUND_TARGET,     0,                              \
-         "(target)",   "Device or label for foreground writes")        \
-       x(background_target,            u16,                            \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_FN(bch2_opt_target),                                      \
-         BCH_SB_BACKGROUND_TARGET,     0,                              \
-         "(target)",   "Device or label to move data to in the background")\
-       x(promote_target,               u16,                            \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_FN(bch2_opt_target),                                      \
-         BCH_SB_PROMOTE_TARGET,        0,                              \
-         "(target)",   "Device or label to promote data to on read")   \
-       x(erasure_code,                 u16,                            \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_BOOL(),                                                   \
-         BCH_SB_ERASURE_CODE,          false,                          \
-         NULL,         "Enable erasure coding (DO NOT USE YET)")       \
-       x(casefold,                     u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_CASEFOLD,              false,                          \
-         NULL,         "Dirent lookups are casefolded")                \
-       x(casefold_disabled,                    u8,                     \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Disable casefolding filesystem wide")          \
-       x(inodes_32bit,                 u8,                             \
-         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
-         OPT_BOOL(),                                                   \
-         BCH_SB_INODE_32BIT,           true,                           \
-         NULL,         "Constrain inode numbers to 32 bits")           \
-       x(shard_inode_numbers_bits,     u8,                             \
-         OPT_FS|OPT_FORMAT,                                            \
-         OPT_UINT(0, 8),                                               \
-         BCH_SB_SHARD_INUMS_NBITS,     0,                              \
-         NULL,         "Shard new inode numbers by CPU id")            \
-       x(inodes_use_key_cache, u8,                                     \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_INODES_USE_KEY_CACHE,  true,                           \
-         NULL,         "Use the btree key cache for the inodes btree") \
-       x(btree_node_mem_ptr_optimization, u8,                          \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               true,                           \
-         NULL,         "Stash pointer to in-memory btree node in btree ptr")\
-       x(gc_reserve_percent,           u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_UINT(5, 21),                                              \
-         BCH_SB_GC_RESERVE,            8,                              \
-         "%",          "Percentage of disk space to reserve for copygc")\
-       x(gc_reserve_bytes,             u64,                            \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|                      \
-         OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS,                      \
-         OPT_UINT(0, U64_MAX),                                         \
-         BCH_SB_GC_RESERVE_BYTES,      0,                              \
-         "size",       "Amount of disk space to reserve for copygc\n"  \
-                       "Takes precedence over gc_reserve_percent if set")\
-       x(root_reserve_percent,         u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_UINT(0, 100),                                             \
-         BCH_SB_ROOT_RESERVE,          0,                              \
-         "%",          "Percentage of disk space to reserve for superuser")\
-       x(wide_macs,                    u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_BOOL(),                                                   \
-         BCH_SB_128_BIT_MACS,          false,                          \
-         NULL,         "Store full 128 bits of cryptographic MACs, instead of 80")\
-       x(inline_data,                  u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               true,                           \
-         NULL,         "Enable inline data extents")                   \
-       x(promote_whole_extents,        u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH_SB_PROMOTE_WHOLE_EXTENTS, true,                           \
-         NULL,         "Promote whole extents, instead of just the part being read")\
-       x(acl,                          u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_POSIX_ACL,             true,                           \
-         NULL,         "Enable POSIX acls")                            \
-       x(usrquota,                     u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_USRQUOTA,              false,                          \
-         NULL,         "Enable user quotas")                           \
-       x(grpquota,                     u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_GRPQUOTA,              false,                          \
-         NULL,         "Enable group quotas")                          \
-       x(prjquota,                     u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
-         OPT_BOOL(),                                                   \
-         BCH_SB_PRJQUOTA,              false,                          \
-         NULL,         "Enable project quotas")                        \
-       x(degraded,                     u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_STR(bch2_degraded_actions),                               \
-         BCH_SB_DEGRADED_ACTION,       BCH_DEGRADED_ask,               \
-         NULL,         "Allow mounting in degraded mode")              \
-       x(no_splitbrain_check,          u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Don't kick drives out when splitbrain detected")\
-       x(verbose,                      u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               BCACHEFS_VERBOSE_DEFAULT,       \
-         NULL,         "Extra debugging information during mount/recovery")\
-       x(journal_flush_delay,          u32,                            \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_UINT(1, U32_MAX),                                         \
-         BCH_SB_JOURNAL_FLUSH_DELAY,   1000,                           \
-         NULL,         "Delay in milliseconds before automatic journal commits")\
-       x(journal_flush_disabled,       u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH_SB_JOURNAL_FLUSH_DISABLED,false,                          \
-         NULL,         "Disable journal flush on sync/fsync\n"         \
-                       "If enabled, writes can be lost, but only since the\n"\
-                       "last journal write (default 1 second)")        \
-       x(journal_reclaim_delay,        u32,                            \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_UINT(0, U32_MAX),                                         \
-         BCH_SB_JOURNAL_RECLAIM_DELAY, 100,                            \
-         NULL,         "Delay in milliseconds before automatic journal reclaim")\
-       x(move_bytes_in_flight,         u32,                            \
-         OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,              \
-         OPT_UINT(1024, U32_MAX),                                      \
-         BCH2_NO_SB_OPT,               1U << 20,                       \
-         NULL,         "Maximum amount of IO to keep in flight by the move path")\
-       x(move_ios_in_flight,           u32,                            \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_UINT(1, 1024),                                            \
-         BCH2_NO_SB_OPT,               32,                             \
-         NULL,         "Maximum number of IOs to keep in flight by the move path")\
-       x(fsck,                         u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Run fsck on mount")                            \
-       x(fsck_memory_usage_percent,    u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_UINT(20, 70),                                             \
-         BCH2_NO_SB_OPT,               50,                             \
-         NULL,         "Maximum percentage of system RAM fsck is allowed to pin")\
-       x(fix_errors,                   u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_FN(bch2_opt_fix_errors),                                  \
-         BCH2_NO_SB_OPT,               FSCK_FIX_exit,                  \
-         NULL,         "Fix errors during fsck without asking")        \
-       x(ratelimit_errors,             u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               RATELIMIT_ERRORS_DEFAULT,       \
-         NULL,         "Ratelimit error messages during fsck")         \
-       x(nochanges,                    u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Super read only mode - no writes at all will be issued,\n"\
-                       "even if we have to replay the journal")        \
-       x(norecovery,                   u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Exit recovery immediately prior to journal replay")\
-       x(journal_rewind,               u64,                            \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_UINT(0, U64_MAX),                                         \
-         BCH2_NO_SB_OPT,               0,                              \
-         NULL,         "Rewind journal")                               \
-       x(recovery_passes,              u64,                            \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BITFIELD(bch2_recovery_passes),                           \
-         BCH2_NO_SB_OPT,               0,                              \
-         NULL,         "Recovery passes to run explicitly")            \
-       x(recovery_passes_exclude,      u64,                            \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BITFIELD(bch2_recovery_passes),                           \
-         BCH2_NO_SB_OPT,               0,                              \
-         NULL,         "Recovery passes to exclude")                   \
-       x(recovery_pass_last,           u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_STR_NOLIMIT(bch2_recovery_passes),                        \
-         BCH2_NO_SB_OPT,               0,                              \
-         NULL,         "Exit recovery after specified pass")           \
-       x(retain_recovery_info,         u8,                             \
-         0,                                                            \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Don't free journal entries/keys, scanned btree nodes after startup")\
-       x(read_entire_journal,          u8,                             \
-         0,                                                            \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Read all journal entries, not just dirty ones")\
-       x(read_journal_only,            u8,                             \
-         0,                                                            \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Only read the journal, skip the rest of recovery")\
-       x(journal_transaction_names,    u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_BOOL(),                                                   \
-         BCH_SB_JOURNAL_TRANSACTION_NAMES, true,                       \
-         NULL,         "Log transaction function names in journal")    \
-       x(allocator_stuck_timeout,      u16,                            \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_UINT(0, U16_MAX),                                         \
-         BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30,                           \
-         NULL,         "Default timeout in seconds for stuck allocator messages")\
-       x(noexcl,                       u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Don't open device in exclusive mode")          \
-       x(direct_io,                    u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       true,                   \
-         NULL,         "Use O_DIRECT (userspace only)")                \
-       x(sb,                           u64,                            \
-         OPT_MOUNT,                                                    \
-         OPT_UINT(0, S64_MAX),                                         \
-         BCH2_NO_SB_OPT,               BCH_SB_SECTOR,                  \
-         "offset",     "Sector offset of superblock")                  \
-       x(read_only,                    u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_HIDDEN,                                  \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         NULL)                                           \
-       x(nostart,                      u8,                             \
-         0,                                                            \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Don't start filesystem, only open devices")   \
-       x(reconstruct_alloc,            u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Reconstruct alloc btree")                      \
-       x(version_upgrade,              u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_STR(bch2_version_upgrade_opts),                           \
-         BCH_SB_VERSION_UPGRADE,       BCH_VERSION_UPGRADE_compatible, \
-         NULL,         "Set superblock to latest version,\n"           \
-                       "allowing any new features to be used")         \
-       x(stdio,                        u64,                            \
-         0,                                                            \
-         OPT_UINT(0, S64_MAX),                                         \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Pointer to a struct stdio_redirect")           \
-       x(project,                      u8,                             \
-         OPT_INODE,                                                    \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         NULL)                                           \
-       x(nocow,                        u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,            \
-         OPT_BOOL(),                                                   \
-         BCH_SB_NOCOW,                 false,                          \
-         NULL,         "Nocow mode: Writes will be done in place when possible.\n"\
-                       "Snapshots and reflink will still cause writes to be COW\n"\
-                       "Implicitly disables data checksumming, compression and encryption")\
-       x(nocow_enabled,                u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       true,                   \
-         NULL,         "Enable nocow mode: enables runtime locking in\n"\
-                       "the data move path, needed if nocow will ever be in use\n")\
-       x(copygc_enabled,               u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       true,                   \
-         NULL,         "Enable copygc: disable for debugging, or to\n"\
-                       "quiet the system when doing performance testing\n")\
-       x(rebalance_enabled,            u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       true,                   \
-         NULL,         "Enable rebalance: disable for debugging, or to\n"\
-                       "quiet the system when doing performance testing\n")\
-       x(rebalance_on_ac_only,         u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH_SB_REBALANCE_AC_ONLY,             false,                  \
-         NULL,         "Enable rebalance while on mains power only\n") \
-       x(auto_snapshot_deletion,       u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       true,                   \
-         NULL,         "Enable automatic snapshot deletion: disable for debugging, or to\n"\
-                       "quiet the system when doing performance testing\n")\
-       x(no_data_io,                   u8,                             \
-         OPT_MOUNT,                                                    \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
-         NULL,         "Skip submit_bio() for data reads and writes, " \
-                       "for performance testing purposes")             \
-       x(state,                        u64,                            \
-         OPT_DEVICE|OPT_RUNTIME,                                       \
-         OPT_STR(bch2_member_states),                                  \
-         BCH_MEMBER_STATE,             BCH_MEMBER_STATE_rw,            \
-         "state",      "rw,ro,failed,spare")                           \
-       x(bucket_size,                  u32,                            \
-         OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS,           \
-         OPT_UINT(0, S64_MAX),                                         \
-         BCH_MEMBER_BUCKET_SIZE,       0,                              \
-         "size",       "Specifies the bucket size; must be greater than the btree node size")\
-       x(durability,                   u8,                             \
-         OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS,                 \
-         OPT_UINT(0, BCH_REPLICAS_MAX),                                \
-         BCH_MEMBER_DURABILITY,        1,                              \
-         "n",          "Data written to this device will be considered\n"\
-                       "to have already been replicated n times")      \
-       x(data_allowed,                 u8,                             \
-         OPT_DEVICE,                                                   \
-         OPT_BITFIELD(__bch2_data_types),                              \
-         BCH_MEMBER_DATA_ALLOWED,      BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
-         "types",      "Allowed data types for this device: journal, btree, and/or user")\
-       x(discard,                      u8,                             \
-         OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME,                      \
-         OPT_BOOL(),                                                   \
-         BCH_MEMBER_DISCARD,           true,                           \
-         NULL,         "Enable discard/TRIM support")                  \
-       x(btree_node_prefetch,          u8,                             \
-         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               true,                           \
-         NULL,         "BTREE_ITER_prefetch causes btree nodes to be\n"\
-         "prefetched sequentially")
-
-struct bch_opts {
-#define x(_name, _bits, ...)   unsigned _name##_defined:1;
-       BCH_OPTS()
-#undef x
-
-#define x(_name, _bits, ...)   _bits   _name;
-       BCH_OPTS()
-#undef x
-};
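
Each BCH_OPTS() entry thus contributes a presence bit plus a value field of the declared width; the expansion begins:

    struct bch_opts {
        unsigned    block_size_defined:1;
        unsigned    btree_node_size_defined:1;
        /* ...one bit per option... */

        u16         block_size;
        u32         btree_node_size;
        /* ...one field per option... */
    };
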
-
-struct bch2_opts_parse {
-       struct bch_opts opts;
-
-       /* to save opts that can't be parsed before the FS is opened: */
-       struct printbuf parse_later;
-};
-
-static const __maybe_unused struct bch_opts bch2_opts_default = {
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)          \
-       ._name##_defined = true,                                        \
-       ._name = _default,                                              \
-
-       BCH_OPTS()
-#undef x
-};
-
-#define opt_defined(_opts, _name)      ((_opts)._name##_defined)
-
-#define opt_get(_opts, _name)                                          \
-       (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
-
-#define opt_set(_opts, _name, _v)                                      \
-do {                                                                   \
-       (_opts)._name##_defined = true;                                 \
-       (_opts)._name = _v;                                             \
-} while (0)
-
-static inline struct bch_opts bch2_opts_empty(void)
-{
-       return (struct bch_opts) { 0 };
-}
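
Typical accessor usage (sketch):

    struct bch_opts opts = bch2_opts_empty();

    opt_set(opts, metadata_replicas, 2);    /* sets the value and the _defined bit */

    if (opt_defined(opts, metadata_replicas))
        /* reads the set value; undefined options fall back to bch2_opts_default */
        pr_info("metadata_replicas %u", opt_get(opts, metadata_replicas));
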
-
-void bch2_opts_apply(struct bch_opts *, struct bch_opts);
-
-enum bch_opt_id {
-#define x(_name, ...)  Opt_##_name,
-       BCH_OPTS()
-#undef x
-       bch2_opts_nr
-};
-
-struct bch_fs;
-struct printbuf;
-
-struct bch_option {
-       struct attribute        attr;
-       enum opt_type           type;
-       enum opt_flags          flags;
-       u64                     min, max;
-
-       const char * const *choices;
-
-       struct bch_opt_fn       fn;
-
-       const char              *hint;
-       const char              *help;
-
-       u64                     (*get_sb)(const struct bch_sb *);
-       void                    (*set_sb)(struct bch_sb *, u64);
-
-       u64                     (*get_member)(const struct bch_member *);
-       void                    (*set_member)(struct bch_member *, u64);
-
-};
-
-extern const struct bch_option bch2_opt_table[];
-
-bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
-u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
-void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
-int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
-
-struct bch_dev;
-bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
-
-int bch2_opt_lookup(const char *);
-int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
-                  const char *, u64 *, struct printbuf *);
-
-#define OPT_SHOW_FULL_LIST     (1 << 0)
-#define OPT_SHOW_MOUNT_STYLE   (1 << 1)
-
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
-                     const struct bch_option *, u64, unsigned);
-void bch2_opts_to_text(struct printbuf *,
-                      struct bch_opts,
-                      struct bch_fs *, struct bch_sb *,
-                      unsigned, unsigned, unsigned);
-
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
-int bch2_opts_hooks_pre_set(struct bch_fs *);
-void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
-                           struct bch_opts *, enum bch_opt_id);
-
-int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
-                            struct printbuf *, const char *, const char *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
-                         char *, bool);
-
-/* inode opts: */
-
-struct bch_io_opts {
-#define x(_name, _bits)        u##_bits _name;
-       BCH_INODE_OPTS()
-#undef x
-#define x(_name, _bits)        u64 _name##_from_inode:1;
-       BCH_INODE_OPTS()
-#undef x
-};
-
-static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
-{
-       if (!opts->background_target)
-               opts->background_target = opts->foreground_target;
-       if (!opts->background_compression)
-               opts->background_compression = opts->compression;
-       if (opts->nocow) {
-               opts->compression = opts->background_compression = 0;
-               opts->data_checksum = 0;
-               opts->erasure_code = 0;
-       }
-}
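
Concrete effect of the fixups (sketch):

    struct bch_io_opts io = {
        .foreground_target  = 1,
        .compression        = 2,
        .nocow              = 1,
    };

    bch2_io_opts_fixups(&io);
    /* background_target inherited foreground_target (1); then nocow cleared
     * compression, background_compression, data_checksum and erasure_code */
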
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-bool bch2_opt_is_inode_opt(enum bch_opt_id);
-
-#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
deleted file mode 100644 (file)
index 3302bbc..0000000
+++ /dev/null
@@ -1,528 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1+
-/* Copyright (C) 2022 Kent Overstreet */
-
-#include <linux/bitmap.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string_helpers.h>
-
-#include "printbuf.h"
-
-static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
-{
-       return pos - buf->last_newline;
-}
-
-static inline unsigned printbuf_linelen(struct printbuf *buf)
-{
-       return __printbuf_linelen(buf, buf->pos);
-}
-
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
-       return buf->cur_tabstop < buf->nr_tabstops
-               ? buf->_tabstops[buf->cur_tabstop]
-               : 0;
-}
-
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
-{
-       /* Reserved space for terminating nul: */
-       extra += 1;
-
-       if (out->pos + extra <= out->size)
-               return 0;
-
-       if (!out->heap_allocated) {
-               out->overflow = true;
-               return 0;
-       }
-
-       unsigned new_size = roundup_pow_of_two(out->size + extra);
-
-       /* Sanity check... */
-       if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
-               out->allocation_failure = true;
-               out->overflow = true;
-               return -ENOMEM;
-       }
-
-       /*
-        * Note: the output buffer must be freeable with kfree(); it's not required
-        * that the user use printbuf_exit().
-        */
-       char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
-
-       if (!buf) {
-               out->allocation_failure = true;
-               out->overflow = true;
-               return -ENOMEM;
-       }
-
-       out->buf        = buf;
-       out->size       = new_size;
-       return 0;
-}
-
-static void printbuf_advance_pos(struct printbuf *out, unsigned len)
-{
-       out->pos += min(len, printbuf_remaining(out));
-}
-
-static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
-{
-       unsigned move = out->pos - pos;
-
-       bch2_printbuf_make_room(out, nr);
-
-       if (pos + nr < out->size)
-               memmove(out->buf + pos + nr,
-                       out->buf + pos,
-                       min(move, out->size - 1 - pos - nr));
-
-       if (pos < out->size)
-               memset(out->buf + pos, ' ', min(nr, out->size - pos));
-
-       printbuf_advance_pos(out, nr);
-       printbuf_nul_terminate_reserved(out);
-}
-
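-/*
- * Scan output written since @pos for '\n', '\r' and '\t', applying the
- * current indent level and tabstops in place:
- */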
-static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
-       while (true) {
-               int pad;
-               unsigned len = out->pos - pos;
-               char *p = out->buf + pos;
-               char *n = memscan(p, '\n', len);
-               if (cur_tabstop(out)) {
-                       n = min(n, (char *) memscan(p, '\r', len));
-                       n = min(n, (char *) memscan(p, '\t', len));
-               }
-
-               pos = n - out->buf;
-               if (pos == out->pos)
-                       break;
-
-               switch (*n) {
-               case '\n':
-                       pos++;
-                       out->last_newline = pos;
-
-                       printbuf_insert_spaces(out, pos, out->indent);
-
-                       pos = min(pos + out->indent, out->pos);
-                       out->last_field = pos;
-                       out->cur_tabstop = 0;
-                       break;
-               case '\r':
-                       memmove(n, n + 1, out->pos - pos);
-                       --out->pos;
-                       pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
-                       if (pad > 0) {
-                               printbuf_insert_spaces(out, out->last_field, pad);
-                               pos += pad;
-                       }
-
-                       out->last_field = pos;
-                       out->cur_tabstop++;
-                       break;
-               case '\t':
-                       pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
-                       if (pad > 0) {
-                               *n = ' ';
-                               printbuf_insert_spaces(out, pos, pad - 1);
-                               pos += pad;
-                       } else {
-                               memmove(n, n + 1, out->pos - pos);
-                               --out->pos;
-                       }
-
-                       out->last_field = pos;
-                       out->cur_tabstop++;
-                       break;
-               }
-       }
-}
-
-static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
-       if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
-               __printbuf_do_indent(out, pos);
-}
-
-void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
-{
-       int len;
-
-       do {
-               va_list args2;
-
-               va_copy(args2, args);
-               len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
-               va_end(args2);
-       } while (len > printbuf_remaining(out) &&
-                !bch2_printbuf_make_room(out, len));
-
-       unsigned indent_pos = out->pos;
-       printbuf_advance_pos(out, len);
-       printbuf_do_indent(out, indent_pos);
-}
-
-void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
-{
-       va_list args;
-       int len;
-
-       do {
-               va_start(args, fmt);
-               len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
-               va_end(args);
-       } while (len > printbuf_remaining(out) &&
-                !bch2_printbuf_make_room(out, len));
-
-       unsigned indent_pos = out->pos;
-       printbuf_advance_pos(out, len);
-       printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
- * null terminated
- * @buf:       printbuf to terminate
- * Returns:    Printbuf contents, as a nul terminated C string
- */
-const char *bch2_printbuf_str(const struct printbuf *buf)
-{
-       /*
-        * If we've written to a printbuf then it's guaranteed to be a null
-        * terminated string - but if we haven't, then we might not have
-        * allocated a buffer at all:
-        */
-       return buf->pos
-               ? buf->buf
-               : "";
-}
-
-/**
- * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
- * against accidental use.
- * @buf:       printbuf to exit
- */
-void bch2_printbuf_exit(struct printbuf *buf)
-{
-       if (buf->heap_allocated) {
-               kfree(buf->buf);
-               buf->buf = ERR_PTR(-EINTR); /* poison value */
-       }
-}
-
-void bch2_printbuf_tabstops_reset(struct printbuf *buf)
-{
-       buf->nr_tabstops = 0;
-}
-
-void bch2_printbuf_tabstop_pop(struct printbuf *buf)
-{
-       if (buf->nr_tabstops)
-               --buf->nr_tabstops;
-}
-
-/*
- * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
- *
- * @buf: printbuf to control
- * @spaces: number of spaces from previous tabstop
- *
- * In the future this function may allocate memory if setting more than
- * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
- * of line.
- */
-int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
-{
-       unsigned prev_tabstop = buf->nr_tabstops
-               ? buf->_tabstops[buf->nr_tabstops - 1]
-               : 0;
-
-       if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
-               return -EINVAL;
-
-       buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
-       buf->has_indent_or_tabstops = true;
-       return 0;
-}
-
-/**
- * bch2_printbuf_indent_add() - add to the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces more spaces.
- */
-void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
-{
-       if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
-               spaces = 0;
-
-       buf->indent += spaces;
-       prt_chars(buf, ' ', spaces);
-
-       buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_add_nextline() - add to the current indent level for
- * subsequent lines
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines - not the current line - will be indented by @spaces more
- * spaces.
- */
-void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces)
-{
-       if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
-               spaces = 0;
-
-       buf->indent += spaces;
-       buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_sub() - subtract from the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to subtract from the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces fewer spaces.
- */
-void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
-{
-       if (WARN_ON_ONCE(spaces > buf->indent))
-               spaces = buf->indent;
-
-       if (buf->last_newline + buf->indent == buf->pos) {
-               buf->pos -= spaces;
-               printbuf_nul_terminate(buf);
-       }
-       buf->indent -= spaces;
-
-       if (!buf->indent && !buf->nr_tabstops)
-               buf->has_indent_or_tabstops = false;
-}
-
-void bch2_prt_newline(struct printbuf *buf)
-{
-       bch2_printbuf_make_room(buf, 1 + buf->indent);
-
-       __prt_char_reserved(buf, '\n');
-
-       buf->last_newline       = buf->pos;
-
-       __prt_chars_reserved(buf, ' ', buf->indent);
-
-       printbuf_nul_terminate_reserved(buf);
-
-       buf->last_field         = buf->pos;
-       buf->cur_tabstop        = 0;
-}
-
-void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
-{
-       for (int p = out->pos - 1; p >= 0; --p) {
-               if (out->buf[p] == '\n') {
-                       out->pos = p;
-                       break;
-               }
-               if (out->buf[p] != ' ')
-                       break;
-       }
-
-       printbuf_nul_terminate_reserved(out);
-}
-
-static void __prt_tab(struct printbuf *out)
-{
-       int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
-
-       prt_chars(out, ' ', spaces);
-
-       out->last_field = out->pos;
-       out->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab() - Advance printbuf to the next tabstop
- * @out:       printbuf to control
- *
- * Advance output to the next tabstop by printing spaces.
- */
-void bch2_prt_tab(struct printbuf *out)
-{
-       if (WARN_ON(!cur_tabstop(out)))
-               return;
-
-       __prt_tab(out);
-}
-
-static void __prt_tab_rjust(struct printbuf *buf)
-{
-       int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
-       if (pad > 0)
-               printbuf_insert_spaces(buf, buf->last_field, pad);
-
-       buf->last_field = buf->pos;
-       buf->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
- * previous output
- *
- * @buf: printbuf to control
- *
- * Advance output to the next tabstop by inserting spaces immediately after the
- * previous tabstop, right justifying previously outputted text.
- */
-void bch2_prt_tab_rjust(struct printbuf *buf)
-{
-       if (WARN_ON(!cur_tabstop(buf)))
-               return;
-
-       __prt_tab_rjust(buf);
-}
-
-/**
- * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
- *
- * @out:       output printbuf
- * @str:       string to print
- * @count:     number of bytes to print
- *
- * The following control characters are handled as follows:
- *   \n: prt_newline   newline that obeys current indent level
- *   \t: prt_tab       advance to next tabstop
- *   \r: prt_tab_rjust advance to next tabstop, with right justification
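- *
- * E.g. (a sketch; assumes a tabstop has been pushed with printbuf_tabstop_push()):
- *   prt_str_indented(&out, "compression:\tlz4\n");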
- */
-void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
-{
-       unsigned indent_pos = out->pos;
-       prt_bytes(out, str, count);
-       printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
- * @out:       output printbuf
- * @v:         integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
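- *
- * E.g. with the default PRINTBUF_UNITS_2, v = 1536 should print as "1.50 KiB"
- * (exact formatting is up to string_get_size()).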
- */
-void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
-{
-       bch2_printbuf_make_room(out, 10);
-       unsigned len = string_get_size(v, 1, !out->si_units,
-                                      out->buf + out->pos,
-                                      printbuf_remaining_size(out));
-       printbuf_advance_pos(out, len);
-}
-
-/**
- * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
- * @out:       output printbuf
- * @v:         integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
-{
-       if (v < 0)
-               prt_char(out, '-');
-       bch2_prt_human_readable_u64(out, abs(v));
-}
-
-/**
- * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
- * @out:       output printbuf
- * @v:         integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_u64(struct printbuf *out, u64 v)
-{
-       if (out->human_readable_units)
-               bch2_prt_human_readable_u64(out, v);
-       else
-               bch2_prt_printf(out, "%llu", v);
-}
-
-/**
- * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
- * @out:       output printbuf
- * @v:         integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_s64(struct printbuf *out, s64 v)
-{
-       if (v < 0)
-               prt_char(out, '-');
-       bch2_prt_units_u64(out, abs(v));
-}
-
-void bch2_prt_string_option(struct printbuf *out,
-                           const char * const list[],
-                           size_t selected)
-{
-       for (size_t i = 0; list[i]; i++)
-               bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_prt_bitflags(struct printbuf *out,
-                      const char * const list[], u64 flags)
-{
-       unsigned bit, nr = 0;
-       bool first = true;
-
-       while (list[nr])
-               nr++;
-
-       while (flags && (bit = __ffs64(flags)) < nr) {
-               if (!first)
-                       bch2_prt_printf(out, ",");
-               first = false;
-               bch2_prt_printf(out, "%s", list[bit]);
-               flags ^= BIT_ULL(bit);
-       }
-}
-
-void bch2_prt_bitflags_vector(struct printbuf *out,
-                             const char * const list[],
-                             unsigned long *v, unsigned nr)
-{
-       bool first = true;
-       unsigned i;
-
-       for (i = 0; i < nr; i++)
-               if (!list[i]) {
-                       nr = i;
-                       break;
-               }
-
-       for_each_set_bit(i, v, nr) {
-               if (!first)
-                       bch2_prt_printf(out, ",");
-               first = false;
-               bch2_prt_printf(out, "%s", list[i]);
-       }
-}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
deleted file mode 100644 (file)
index 8f4e28d..0000000
+++ /dev/null
@@ -1,298 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1+ */
-/* Copyright (C) 2022 Kent Overstreet */
-
-#ifndef _BCACHEFS_PRINTBUF_H
-#define _BCACHEFS_PRINTBUF_H
-
-/*
- * Printbufs: Simple strings for printing to, with optional heap allocation
- *
- * This code has provisions for use in userspace, to aid in making other code
- * portable between kernelspace and userspace.
- *
- * Basic example:
- *   struct printbuf buf = PRINTBUF;
- *
- *   prt_printf(&buf, "foo=");
- *   foo_to_text(&buf, foo);
- *   printk("%s", buf.buf);
- *   printbuf_exit(&buf);
- *
- * Or
- *   struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
- *
- * We can now write pretty printers instead of writing code that dumps
- * everything to the kernel log buffer, and then those pretty-printers can be
- * used by other code that outputs to kernel log, sysfs, debugfs, etc.
- *
- * Memory allocation: Outputting to a printbuf may allocate memory. This
- * allocation is done with GFP_KERNEL, by default: use the newer
- * memalloc_*_(save|restore) functions as needed.
- *
- * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
- * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
- *
- * It's allowed to grab the output buffer and free it later with kfree() instead
- * of using printbuf_exit(), if the user just needs a heap allocated string at
- * the end.
- *
- * Memory allocation failures: We don't return errors directly, because on
- * memory allocation failure we usually don't want to bail out and unwind - we
- * want to print what we've got, on a best-effort basis. But code that does want
- * to return -ENOMEM may check printbuf.allocation_failure.
- *
- * Indenting, tabstops:
- *
- * To aid in writing multi-line pretty printers spread across multiple
- * functions, printbufs track the current indent level.
- *
- * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
- * current indent level, respectively.
- *
- * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
- * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
- * prt_tab_rjust() will also advance the current line of text up to the next
- * tabstop, but it does so by shifting text since the previous tabstop up to the
- * next tabstop - right justifying it.
- *
- * Make sure you use prt_newline() instead of \n in the format string for indent
- * level and tabstops to work correctly.
- *
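- * Tabstop example (a sketch, using the same unprefixed wrapper names as the
- * example above):
- *
- *   printbuf_tabstop_push(&buf, 24);
- *   prt_printf(&buf, "compression:\tlz4\n");
- *
- * Here '\t' advances output to the 24-space tabstop before printing "lz4".
- *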
- * Output units: printbuf->human_readable_units and printbuf->si_units tell
- * pretty-printers how to output numbers: as a raw value (e.g. directly from a
- * superblock field) or as human readable bytes. prt_units_u64() and
- * prt_units_s64() obey them.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum printbuf_si {
-       PRINTBUF_UNITS_2,       /* use powers of 2^10 (binary) */
-       PRINTBUF_UNITS_10,      /* use powers of 10^3 (standard SI) */
-};
-
-#define PRINTBUF_INLINE_TABSTOPS       6
-
-struct printbuf {
-       char                    *buf;
-       unsigned                size;
-       unsigned                pos;
-       unsigned                last_newline;
-       unsigned                last_field;
-       unsigned                indent;
-       /*
-        * If nonzero, allocations will be done with GFP_NOWAIT:
-        */
-       u8                      atomic;
-       bool                    allocation_failure:1;
-       bool                    heap_allocated:1;
-       bool                    overflow:1;
-       enum printbuf_si        si_units:1;
-       bool                    human_readable_units:1;
-       bool                    has_indent_or_tabstops:1;
-       bool                    suppress_indent_tabstop_handling:1;
-       u8                      nr_tabstops;
-
-       /*
-        * Do not modify directly: use printbuf_tabstop_add(),
-        * Do not modify directly: use printbuf_tabstop_push(),
-        * printbuf_tabstop_pop()
-       u8                      cur_tabstop;
-       u8                      _tabstops[PRINTBUF_INLINE_TABSTOPS];
-};
-
-int bch2_printbuf_make_room(struct printbuf *, unsigned);
-__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
-__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
-const char *bch2_printbuf_str(const struct printbuf *);
-void bch2_printbuf_exit(struct printbuf *);
-
-void bch2_printbuf_tabstops_reset(struct printbuf *);
-void bch2_printbuf_tabstop_pop(struct printbuf *);
-int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
-
-void bch2_printbuf_indent_add(struct printbuf *, unsigned);
-void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned);
-void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
-
-void bch2_prt_newline(struct printbuf *);
-void bch2_printbuf_strip_trailing_newline(struct printbuf *);
-void bch2_prt_tab(struct printbuf *);
-void bch2_prt_tab_rjust(struct printbuf *);
-
-void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
-void bch2_prt_human_readable_u64(struct printbuf *, u64);
-void bch2_prt_human_readable_s64(struct printbuf *, s64);
-void bch2_prt_units_u64(struct printbuf *, u64);
-void bch2_prt_units_s64(struct printbuf *, s64);
-void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
-void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
-void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
-                             unsigned long *, unsigned);
-
-/* Initializer for a heap allocated printbuf: */
-#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
-
-/* Initializer for a printbuf that points to an external buffer: */
-#define PRINTBUF_EXTERN(_buf, _size)                   \
-((struct printbuf) {                                   \
-       .buf    = _buf,                                 \
-       .size   = _size,                                \
-})
-
-static inline struct printbuf bch2_printbuf_init(void)
-{
-       return PRINTBUF;
-}
-
-DEFINE_CLASS(printbuf, struct printbuf,
-            bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
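-
-/*
- * With the DEFINE_CLASS() above, a scope-managed printbuf can be declared as
- * (a sketch):
- *
- *     CLASS(printbuf, buf)();
- *
- * bch2_printbuf_exit() then runs automatically when @buf goes out of scope.
- */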
-
-/*
- * Returns size remaining of output buffer:
- */
-static inline unsigned printbuf_remaining_size(struct printbuf *out)
-{
-       if (WARN_ON(out->size && out->pos >= out->size))
-               out->pos = out->size - 1;
-       return out->size - out->pos;
-}
-
-/*
- * Returns number of characters we can print to the output buffer - i.e.
- * excluding the terminating nul:
- */
-static inline unsigned printbuf_remaining(struct printbuf *out)
-{
-       return out->size ? printbuf_remaining_size(out) - 1 : 0;
-}
-
-static inline unsigned printbuf_written(struct printbuf *out)
-{
-       return out->size ? min(out->pos, out->size - 1) : 0;
-}
-
-static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
-{
-       if (WARN_ON(out->size && out->pos >= out->size))
-               out->pos = out->size - 1;
-       if (out->size)
-               out->buf[out->pos] = 0;
-}
-
-static inline void printbuf_nul_terminate(struct printbuf *out)
-{
-       bch2_printbuf_make_room(out, 1);
-       printbuf_nul_terminate_reserved(out);
-}
-
-/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
-static inline void __prt_char_reserved(struct printbuf *out, char c)
-{
-       if (printbuf_remaining(out))
-               out->buf[out->pos++] = c;
-}
-
-/* Doesn't nul terminate: */
-static inline void __prt_char(struct printbuf *out, char c)
-{
-       bch2_printbuf_make_room(out, 1);
-       __prt_char_reserved(out, c);
-}
-
-static inline void prt_char(struct printbuf *out, char c)
-{
-       bch2_printbuf_make_room(out, 2);
-       __prt_char_reserved(out, c);
-       printbuf_nul_terminate_reserved(out);
-}
-
-static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
-{
-       unsigned can_print = min(n, printbuf_remaining(out));
-
-       for (unsigned i = 0; i < can_print; i++)
-               out->buf[out->pos++] = c;
-}
-
-static inline void prt_chars(struct printbuf *out, char c, unsigned n)
-{
-       bch2_printbuf_make_room(out, n);
-       __prt_chars_reserved(out, c, n);
-       printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
-{
-       bch2_printbuf_make_room(out, n);
-
-       unsigned can_print = min(n, printbuf_remaining(out));
-
-       for (unsigned i = 0; i < can_print; i++)
-               out->buf[out->pos++] = ((char *) b)[i];
-
-       printbuf_nul_terminate(out);
-}
-
-static inline void prt_str(struct printbuf *out, const char *str)
-{
-       prt_bytes(out, str, strlen(str));
-}
-
-static inline void prt_str_indented(struct printbuf *out, const char *str)
-{
-       bch2_prt_bytes_indented(out, str, strlen(str));
-}
-
-static inline void prt_hex_byte(struct printbuf *out, u8 byte)
-{
-       bch2_printbuf_make_room(out, 3);
-       __prt_char_reserved(out, hex_asc_hi(byte));
-       __prt_char_reserved(out, hex_asc_lo(byte));
-       printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
-{
-       bch2_printbuf_make_room(out, 3);
-       __prt_char_reserved(out, hex_asc_upper_hi(byte));
-       __prt_char_reserved(out, hex_asc_upper_lo(byte));
-       printbuf_nul_terminate_reserved(out);
-}
-
-static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
-{
-       buf->pos                = 0;
-       buf->allocation_failure = 0;
-       buf->last_newline       = 0;
-       buf->last_field         = 0;
-       buf->indent             = 0;
-       buf->cur_tabstop        = 0;
-}
-
-/**
- * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
- */
-static inline void printbuf_reset(struct printbuf *buf)
-{
-       printbuf_reset_keep_tabstops(buf);
-       buf->nr_tabstops        = 0;
-}
-
-/**
- * printbuf_atomic_inc - mark as entering an atomic section
- */
-static inline void printbuf_atomic_inc(struct printbuf *buf)
-{
-       buf->atomic++;
-}
-
-/**
- * printbuf_atomic_dec - mark as leaving an atomic section
- */
-static inline void printbuf_atomic_dec(struct printbuf *buf)
-{
-       buf->atomic--;
-}
-
-#endif /* _BCACHEFS_PRINTBUF_H */
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
deleted file mode 100644 (file)
index d098985..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "disk_accounting.h"
-#include "progress.h"
-
-void bch2_progress_init(struct progress_indicator_state *s,
-                       struct bch_fs *c,
-                       u64 btree_id_mask)
-{
-       memset(s, 0, sizeof(*s));
-
-       s->next_print = jiffies + HZ * 10;
-
-       for (unsigned i = 0; i < BTREE_ID_NR; i++) {
-               if (!(btree_id_mask & BIT_ULL(i)))
-                       continue;
-
-               struct disk_accounting_pos acc;
-               disk_accounting_key_init(acc, btree, .id = i);
-
-               u64 v;
-               bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-               s->nodes_total += div64_ul(v, btree_sectors(c));
-       }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
-       bool ret = time_after_eq(jiffies, s->next_print);
-
-       if (ret)
-               s->next_print = jiffies + HZ * 10;
-       return ret;
-}
-
-void bch2_progress_update_iter(struct btree_trans *trans,
-                              struct progress_indicator_state *s,
-                              struct btree_iter *iter,
-                              const char *msg)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
-       s->nodes_seen += b != s->last_node;
-       s->last_node = b;
-
-       if (progress_update_p(s)) {
-               struct printbuf buf = PRINTBUF;
-               unsigned percent = s->nodes_total
-                       ? div64_u64(s->nodes_seen * 100, s->nodes_total)
-                       : 0;
-
-               prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
-                          msg, percent, s->nodes_seen, s->nodes_total);
-               bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
-               bch_info(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
deleted file mode 100644 (file)
index 23fb181..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_PROGRESS_H
-#define _BCACHEFS_PROGRESS_H
-
-/*
- * Lame progress indicators
- *
- * We don't like to use these because they print to the dmesg console, which is
- * spammy - we much prefer to be wired up to a userspace program (e.g. via
- * thread_with_file) and have it print the progress indicator.
- *
- * But some code is old and doesn't support that, or runs in a context where
- * that's not yet practical (mount).
- */
-
-struct progress_indicator_state {
-       unsigned long           next_print;
-       u64                     nodes_seen;
-       u64                     nodes_total;
-       struct btree            *last_node;
-};
-
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
-void bch2_progress_update_iter(struct btree_trans *,
-                              struct progress_indicator_state *,
-                              struct btree_iter *,
-                              const char *);
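-
-/*
- * Typical usage (a sketch, not code from this file; check_extent() stands in
- * for real per-key work):
- *
- *     struct progress_indicator_state progress;
- *
- *     bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));
- *     for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ({
- *             bch2_progress_update_iter(trans, &progress, &iter, __func__);
- *             check_extent(trans, &iter, k);
- *     }));
- */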
-
-#endif /* _BCACHEFS_PROGRESS_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
deleted file mode 100644 (file)
index f241efb..0000000
+++ /dev/null
@@ -1,892 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "quota.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-static const char * const bch2_quota_types[] = {
-       "user",
-       "group",
-       "project",
-};
-
-static const char * const bch2_quota_counters[] = {
-       "space",
-       "inodes",
-};
-
-static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_quota *q = field_to_type(f, quota);
-
-       if (vstruct_bytes(&q->field) < sizeof(*q)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&q->field), sizeof(*q));
-               return -BCH_ERR_invalid_sb_quota;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_quota *q = field_to_type(f, quota);
-       unsigned qtyp, counter;
-
-       for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
-               prt_printf(out, "%s: flags %llx",
-                      bch2_quota_types[qtyp],
-                      le64_to_cpu(q->q[qtyp].flags));
-
-               for (counter = 0; counter < Q_COUNTERS; counter++)
-                       prt_printf(out, " %s timelimit %u warnlimit %u",
-                              bch2_quota_counters[counter],
-                              le32_to_cpu(q->q[qtyp].c[counter].timelimit),
-                              le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
-
-               prt_newline(out);
-       }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_quota = {
-       .validate       = bch2_sb_quota_validate,
-       .to_text        = bch2_sb_quota_to_text,
-};
-
-int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
-                       struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
-                        c, quota_type_invalid,
-                        "invalid quota type (%llu >= %u)",
-                        k.k->p.inode, QTYP_NR);
-fsck_err:
-       return ret;
-}
-
-void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
-                       struct bkey_s_c k)
-{
-       struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
-       unsigned i;
-
-       for (i = 0; i < Q_COUNTERS; i++)
-               prt_printf(out, "%s hardlimit %llu softlimit %llu",
-                      bch2_quota_counters[i],
-                      le64_to_cpu(dq.v->c[i].hardlimit),
-                      le64_to_cpu(dq.v->c[i].softlimit));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/quota.h>
-
-static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
-{
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 20);
-
-       prt_printf(out, "i_fieldmask\t%x\n",            i->i_fieldmask);
-       prt_printf(out, "i_flags\t%u\n",                i->i_flags);
-       prt_printf(out, "i_spc_timelimit\t%u\n",        i->i_spc_timelimit);
-       prt_printf(out, "i_ino_timelimit\t%u\n",        i->i_ino_timelimit);
-       prt_printf(out, "i_rt_spc_timelimit\t%u\n",     i->i_rt_spc_timelimit);
-       prt_printf(out, "i_spc_warnlimit\t%u\n",        i->i_spc_warnlimit);
-       prt_printf(out, "i_ino_warnlimit\t%u\n",        i->i_ino_warnlimit);
-       prt_printf(out, "i_rt_spc_warnlimit\t%u\n",     i->i_rt_spc_warnlimit);
-}
-
-static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
-{
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, 20);
-
-       prt_printf(out, "d_fieldmask\t%x\n",            q->d_fieldmask);
-       prt_printf(out, "d_spc_hardlimit\t%llu\n",      q->d_spc_hardlimit);
-       prt_printf(out, "d_spc_softlimit\t%llu\n",      q->d_spc_softlimit);
-       prt_printf(out, "d_ino_hardlimit\t%llu\n",      q->d_ino_hardlimit);
-       prt_printf(out, "d_ino_softlimit\t%llu\n",      q->d_ino_softlimit);
-       prt_printf(out, "d_space\t%llu\n",              q->d_space);
-       prt_printf(out, "d_ino_count\t%llu\n",          q->d_ino_count);
-       prt_printf(out, "d_ino_timer\t%llu\n",          q->d_ino_timer);
-       prt_printf(out, "d_spc_timer\t%llu\n",          q->d_spc_timer);
-       prt_printf(out, "d_ino_warns\t%i\n",            q->d_ino_warns);
-       prt_printf(out, "d_spc_warns\t%i\n",            q->d_spc_warns);
-}
-
-static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
-{
-       qtypes >>= i;
-       return qtypes ? i + __ffs(qtypes) : QTYP_NR;
-}
-
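-/*
- * Iterate over each quota type whose bit is set in @_qtypes, setting @_i to
- * the type index and @_q to the corresponding bch_memquota_type:
- */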
-#define for_each_set_qtype(_c, _i, _q, _qtypes)                                \
-       for (_i = 0;                                                    \
-            (_i = __next_qtype(_i, _qtypes),                           \
-             _q = &(_c)->quotas[_i],                                   \
-             _i < QTYP_NR);                                            \
-            _i++)
-
-static bool ignore_hardlimit(struct bch_memquota_type *q)
-{
-       if (capable(CAP_SYS_RESOURCE))
-               return true;
-#if 0
-       struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-
-       return capable(CAP_SYS_RESOURCE) &&
-              (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
-               !(info->dqi_flags & DQF_ROOT_SQUASH));
-#endif
-       return false;
-}
-
-enum quota_msg {
-       SOFTWARN,       /* Softlimit reached */
-       SOFTLONGWARN,   /* Grace time expired */
-       HARDWARN,       /* Hardlimit reached */
-
-       HARDBELOW,      /* Usage got below inode hardlimit */
-       SOFTBELOW,      /* Usage got below inode softlimit */
-};
-
-static int quota_nl[][Q_COUNTERS] = {
-       [HARDWARN][Q_SPC]       = QUOTA_NL_BHARDWARN,
-       [SOFTLONGWARN][Q_SPC]   = QUOTA_NL_BSOFTLONGWARN,
-       [SOFTWARN][Q_SPC]       = QUOTA_NL_BSOFTWARN,
-       [HARDBELOW][Q_SPC]      = QUOTA_NL_BHARDBELOW,
-       [SOFTBELOW][Q_SPC]      = QUOTA_NL_BSOFTBELOW,
-
-       [HARDWARN][Q_INO]       = QUOTA_NL_IHARDWARN,
-       [SOFTLONGWARN][Q_INO]   = QUOTA_NL_ISOFTLONGWARN,
-       [SOFTWARN][Q_INO]       = QUOTA_NL_ISOFTWARN,
-       [HARDBELOW][Q_INO]      = QUOTA_NL_IHARDBELOW,
-       [SOFTBELOW][Q_INO]      = QUOTA_NL_ISOFTBELOW,
-};
-
-struct quota_msgs {
-       u8              nr;
-       struct {
-               u8      qtype;
-               u8      msg;
-       }               m[QTYP_NR * Q_COUNTERS];
-};
-
-static void prepare_msg(unsigned qtype,
-                       enum quota_counters counter,
-                       struct quota_msgs *msgs,
-                       enum quota_msg msg_type)
-{
-       BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
-
-       msgs->m[msgs->nr].qtype = qtype;
-       msgs->m[msgs->nr].msg   = quota_nl[msg_type][counter];
-       msgs->nr++;
-}
-
-static void prepare_warning(struct memquota_counter *qc,
-                           unsigned qtype,
-                           enum quota_counters counter,
-                           struct quota_msgs *msgs,
-                           enum quota_msg msg_type)
-{
-       if (qc->warning_issued & (1 << msg_type))
-               return;
-
-       prepare_msg(qtype, counter, msgs, msg_type);
-}
-
-static void flush_warnings(struct bch_qid qid,
-                          struct super_block *sb,
-                          struct quota_msgs *msgs)
-{
-       unsigned i;
-
-       for (i = 0; i < msgs->nr; i++)
-               quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
-                                  sb->s_dev, msgs->m[i].msg);
-}
-
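-/*
- * Check whether adding @v to @counter would cross a soft or hard limit,
- * queueing up quota netlink warnings as appropriate; returns -EDQUOT if the
- * change should be rejected:
- */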
-static int bch2_quota_check_limit(struct bch_fs *c,
-                                 unsigned qtype,
-                                 struct bch_memquota *mq,
-                                 struct quota_msgs *msgs,
-                                 enum quota_counters counter,
-                                 s64 v,
-                                 enum quota_acct_mode mode)
-{
-       struct bch_memquota_type *q = &c->quotas[qtype];
-       struct memquota_counter *qc = &mq->c[counter];
-       u64 n = qc->v + v;
-
-       BUG_ON((s64) n < 0);
-
-       if (mode == KEY_TYPE_QUOTA_NOCHECK)
-               return 0;
-
-       if (v <= 0) {
-               if (n < qc->hardlimit &&
-                   (qc->warning_issued & (1 << HARDWARN))) {
-                       qc->warning_issued &= ~(1 << HARDWARN);
-                       prepare_msg(qtype, counter, msgs, HARDBELOW);
-               }
-
-               if (n < qc->softlimit &&
-                   (qc->warning_issued & (1 << SOFTWARN))) {
-                       qc->warning_issued &= ~(1 << SOFTWARN);
-                       prepare_msg(qtype, counter, msgs, SOFTBELOW);
-               }
-
-               qc->warning_issued = 0;
-               return 0;
-       }
-
-       if (qc->hardlimit &&
-           qc->hardlimit < n &&
-           !ignore_hardlimit(q)) {
-               prepare_warning(qc, qtype, counter, msgs, HARDWARN);
-               return -EDQUOT;
-       }
-
-       if (qc->softlimit &&
-           qc->softlimit < n) {
-               if (qc->timer == 0) {
-                       qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
-                       prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
-               } else if (ktime_get_real_seconds() >= qc->timer &&
-                          !ignore_hardlimit(q)) {
-                       prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
-                       return -EDQUOT;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
-                   enum quota_counters counter, s64 v,
-                   enum quota_acct_mode mode)
-{
-       unsigned qtypes = enabled_qtypes(c);
-       struct bch_memquota_type *q;
-       struct bch_memquota *mq[QTYP_NR];
-       struct quota_msgs msgs;
-       unsigned i;
-       int ret = 0;
-
-       memset(&msgs, 0, sizeof(msgs));
-
-       for_each_set_qtype(c, i, q, qtypes) {
-               mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
-               if (!mq[i])
-                       return -ENOMEM;
-       }
-
-       for_each_set_qtype(c, i, q, qtypes)
-               mutex_lock_nested(&q->lock, i);
-
-       for_each_set_qtype(c, i, q, qtypes) {
-               ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
-               if (ret)
-                       goto err;
-       }
-
-       for_each_set_qtype(c, i, q, qtypes)
-               mq[i]->c[counter].v += v;
-err:
-       for_each_set_qtype(c, i, q, qtypes)
-               mutex_unlock(&q->lock);
-
-       flush_warnings(qid, c->vfs_sb, &msgs);
-
-       return ret;
-}
-
-static void __bch2_quota_transfer(struct bch_memquota *src_q,
-                                 struct bch_memquota *dst_q,
-                                 enum quota_counters counter, s64 v)
-{
-       BUG_ON(v > src_q->c[counter].v);
-       BUG_ON(v + dst_q->c[counter].v < v);
-
-       src_q->c[counter].v -= v;
-       dst_q->c[counter].v += v;
-}
-
-int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
-                       struct bch_qid dst,
-                       struct bch_qid src, u64 space,
-                       enum quota_acct_mode mode)
-{
-       struct bch_memquota_type *q;
-       struct bch_memquota *src_q[3], *dst_q[3];
-       struct quota_msgs msgs;
-       unsigned i;
-       int ret = 0;
-
-       qtypes &= enabled_qtypes(c);
-
-       memset(&msgs, 0, sizeof(msgs));
-
-       for_each_set_qtype(c, i, q, qtypes) {
-               src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
-               dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
-               if (!src_q[i] || !dst_q[i])
-                       return -ENOMEM;
-       }
-
-       for_each_set_qtype(c, i, q, qtypes)
-               mutex_lock_nested(&q->lock, i);
-
-       for_each_set_qtype(c, i, q, qtypes) {
-               ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
-                                            dst_q[i]->c[Q_SPC].v + space,
-                                            mode);
-               if (ret)
-                       goto err;
-
-               ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
-                                            dst_q[i]->c[Q_INO].v + 1,
-                                            mode);
-               if (ret)
-                       goto err;
-       }
-
-       for_each_set_qtype(c, i, q, qtypes) {
-               __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
-               __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
-       }
-
-err:
-       for_each_set_qtype(c, i, q, qtypes)
-               mutex_unlock(&q->lock);
-
-       flush_warnings(dst, c->vfs_sb, &msgs);
-
-       return ret;
-}
-
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
-                           struct qc_dqblk *qdq)
-{
-       struct bkey_s_c_quota dq;
-       struct bch_memquota_type *q;
-       struct bch_memquota *mq;
-       unsigned i;
-
-       BUG_ON(k.k->p.inode >= QTYP_NR);
-
-       if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
-               return 0;
-
-       switch (k.k->type) {
-       case KEY_TYPE_quota:
-               dq = bkey_s_c_to_quota(k);
-               q = &c->quotas[k.k->p.inode];
-
-               mutex_lock(&q->lock);
-               mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
-               if (!mq) {
-                       mutex_unlock(&q->lock);
-                       return -ENOMEM;
-               }
-
-               for (i = 0; i < Q_COUNTERS; i++) {
-                       mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
-                       mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
-               }
-
-               if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
-                       mq->c[Q_SPC].timer      = qdq->d_spc_timer;
-               if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
-                       mq->c[Q_SPC].warns      = qdq->d_spc_warns;
-               if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
-                       mq->c[Q_INO].timer      = qdq->d_ino_timer;
-               if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
-                       mq->c[Q_INO].warns      = qdq->d_ino_warns;
-
-               mutex_unlock(&q->lock);
-       }
-
-       return 0;
-}
-
-void bch2_fs_quota_exit(struct bch_fs *c)
-{
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
-               genradix_free(&c->quotas[i].table);
-}
-
-void bch2_fs_quota_init(struct bch_fs *c)
-{
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
-               mutex_init(&c->quotas[i].lock);
-}
-
-static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
-{
-       struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
-
-       if (sb_quota)
-               return sb_quota;
-
-       sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
-       if (sb_quota) {
-               unsigned qtype, qc;
-
-               for (qtype = 0; qtype < QTYP_NR; qtype++)
-                       for (qc = 0; qc < Q_COUNTERS; qc++)
-                               sb_quota->q[qtype].c[qc].timelimit =
-                                       cpu_to_le32(7 * 24 * 60 * 60);
-       }
-
-       return sb_quota;
-}
-
-static void bch2_sb_quota_read(struct bch_fs *c)
-{
-       struct bch_sb_field_quota *sb_quota;
-       unsigned i, j;
-
-       sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
-       if (!sb_quota)
-               return;
-
-       for (i = 0; i < QTYP_NR; i++) {
-               struct bch_memquota_type *q = &c->quotas[i];
-
-               for (j = 0; j < Q_COUNTERS; j++) {
-                       q->limits[j].timelimit =
-                               le32_to_cpu(sb_quota->q[i].c[j].timelimit);
-                       q->limits[j].warnlimit =
-                               le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
-               }
-       }
-}
-
-static int bch2_fs_quota_read_inode(struct btree_trans *trans,
-                                   struct btree_iter *iter,
-                                   struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_inode_unpacked u;
-       struct bch_snapshot_tree s_t;
-       u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
-
-       int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                       "%s: snapshot tree %u not found", __func__, tree);
-       if (ret)
-               return ret;
-
-       if (!s_t.master_subvol)
-               goto advance;
-
-       ret = bch2_inode_find_by_inum_nowarn_trans(trans,
-                               (subvol_inum) {
-                                       le32_to_cpu(s_t.master_subvol),
-                                       k.k->p.offset,
-                               }, &u);
-       /*
-        * Inode might be deleted in this snapshot - the easiest way to handle
-        * that is to just skip it here:
-        */
-       if (bch2_err_matches(ret, ENOENT))
-               goto advance;
-
-       if (ret)
-               return ret;
-
-       bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
-                       KEY_TYPE_QUOTA_NOCHECK);
-       bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
-                       KEY_TYPE_QUOTA_NOCHECK);
-advance:
-       bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
-       return 0;
-}
-
-int bch2_fs_quota_read(struct bch_fs *c)
-{
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
-       if (!sb_quota) {
-               mutex_unlock(&c->sb_lock);
-               return bch_err_throw(c, ENOSPC_sb_quota);
-       }
-
-       bch2_sb_quota_read(c);
-       mutex_unlock(&c->sb_lock);
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
-                                  BTREE_ITER_prefetch, k,
-                       __bch2_quota_set(c, k, NULL)) ?:
-               for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
-                                  BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                       bch2_fs_quota_read_inode(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/* Enable/disable/delete quotas for an entire filesystem: */
-
-static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       struct bch_sb_field_quota *sb_quota;
-       int ret = 0;
-
-       if (sb->s_flags & SB_RDONLY)
-               return -EROFS;
-
-       /* Accounting must be enabled at mount time: */
-       if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
-               return -EINVAL;
-
-       /* Can't enable enforcement without accounting: */
-       if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
-               return -EINVAL;
-
-       if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
-               return -EINVAL;
-
-       if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
-               return -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
-       if (!sb_quota) {
-               ret = bch_err_throw(c, ENOSPC_sb_quota);
-               goto unlock;
-       }
-
-       if (uflags & FS_QUOTA_UDQ_ENFD)
-               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
-
-       if (uflags & FS_QUOTA_GDQ_ENFD)
-               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
-
-       if (uflags & FS_QUOTA_PDQ_ENFD)
-               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
-
-       bch2_write_super(c);
-unlock:
-       mutex_unlock(&c->sb_lock);
-
-       return bch2_err_class(ret);
-}
-
-static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
-{
-       struct bch_fs *c = sb->s_fs_info;
-
-       if (sb->s_flags & SB_RDONLY)
-               return -EROFS;
-
-       mutex_lock(&c->sb_lock);
-       if (uflags & FS_QUOTA_UDQ_ENFD)
-               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
-
-       if (uflags & FS_QUOTA_GDQ_ENFD)
-               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
-
-       if (uflags & FS_QUOTA_PDQ_ENFD)
-               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
-static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       int ret;
-
-       if (sb->s_flags & SB_RDONLY)
-               return -EROFS;
-
-       if (uflags & FS_USER_QUOTA) {
-               if (c->opts.usrquota)
-                       return -EINVAL;
-
-               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
-                                             POS(QTYP_USR, 0),
-                                             POS(QTYP_USR, U64_MAX),
-                                             0, NULL);
-               if (ret)
-                       return ret;
-       }
-
-       if (uflags & FS_GROUP_QUOTA) {
-               if (c->opts.grpquota)
-                       return -EINVAL;
-
-               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
-                                             POS(QTYP_GRP, 0),
-                                             POS(QTYP_GRP, U64_MAX),
-                                             0, NULL);
-               if (ret)
-                       return ret;
-       }
-
-       if (uflags & FS_PROJ_QUOTA) {
-               if (c->opts.prjquota)
-                       return -EINVAL;
-
-               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
-                                             POS(QTYP_PRJ, 0),
-                                             POS(QTYP_PRJ, U64_MAX),
-                                             0, NULL);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/*
- * Return quota status information, such as enforcements, quota file inode
- * numbers etc.
- */
-static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       unsigned qtypes = enabled_qtypes(c);
-       unsigned i;
-
-       memset(state, 0, sizeof(*state));
-
-       for (i = 0; i < QTYP_NR; i++) {
-               state->s_state[i].flags |= QCI_SYSFILE;
-
-               if (!(qtypes & (1 << i)))
-                       continue;
-
-               state->s_state[i].flags |= QCI_ACCT_ENABLED;
-
-               state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
-               state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
-
-               state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
-               state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
-       }
-
-       return 0;
-}
-
-/*
- * Adjust quota timers & warnings
- */
-static int bch2_quota_set_info(struct super_block *sb, int type,
-                              struct qc_info *info)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       struct bch_sb_field_quota *sb_quota;
-       int ret = 0;
-
-       if (0) {
-               struct printbuf buf = PRINTBUF;
-
-               qc_info_to_text(&buf, info);
-               pr_info("setting:\n%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       if (sb->s_flags & SB_RDONLY)
-               return -EROFS;
-
-       if (type >= QTYP_NR)
-               return -EINVAL;
-
-       if (!((1 << type) & enabled_qtypes(c)))
-               return -ESRCH;
-
-       if (info->i_fieldmask &
-           ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
-               return -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
-       if (!sb_quota) {
-               ret = bch_err_throw(c, ENOSPC_sb_quota);
-               goto unlock;
-       }
-
-       if (info->i_fieldmask & QC_SPC_TIMER)
-               sb_quota->q[type].c[Q_SPC].timelimit =
-                       cpu_to_le32(info->i_spc_timelimit);
-
-       if (info->i_fieldmask & QC_SPC_WARNS)
-               sb_quota->q[type].c[Q_SPC].warnlimit =
-                       cpu_to_le32(info->i_spc_warnlimit);
-
-       if (info->i_fieldmask & QC_INO_TIMER)
-               sb_quota->q[type].c[Q_INO].timelimit =
-                       cpu_to_le32(info->i_ino_timelimit);
-
-       if (info->i_fieldmask & QC_INO_WARNS)
-               sb_quota->q[type].c[Q_INO].warnlimit =
-                       cpu_to_le32(info->i_ino_warnlimit);
-
-       bch2_sb_quota_read(c);
-
-       bch2_write_super(c);
-unlock:
-       mutex_unlock(&c->sb_lock);
-
-       return bch2_err_class(ret);
-}
-
-/* Get/set individual quotas: */
-
-static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
-{
-       dst->d_space            = src->c[Q_SPC].v << 9;
-       dst->d_spc_hardlimit    = src->c[Q_SPC].hardlimit << 9;
-       dst->d_spc_softlimit    = src->c[Q_SPC].softlimit << 9;
-       dst->d_spc_timer        = src->c[Q_SPC].timer;
-       dst->d_spc_warns        = src->c[Q_SPC].warns;
-
-       dst->d_ino_count        = src->c[Q_INO].v;
-       dst->d_ino_hardlimit    = src->c[Q_INO].hardlimit;
-       dst->d_ino_softlimit    = src->c[Q_INO].softlimit;
-       dst->d_ino_timer        = src->c[Q_INO].timer;
-       dst->d_ino_warns        = src->c[Q_INO].warns;
-}
-
-static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
-                         struct qc_dqblk *qdq)
-{
-       struct bch_fs *c                = sb->s_fs_info;
-       struct bch_memquota_type *q     = &c->quotas[kqid.type];
-       qid_t qid                       = from_kqid(&init_user_ns, kqid);
-       struct bch_memquota *mq;
-
-       memset(qdq, 0, sizeof(*qdq));
-
-       mutex_lock(&q->lock);
-       mq = genradix_ptr(&q->table, qid);
-       if (mq)
-               __bch2_quota_get(qdq, mq);
-       mutex_unlock(&q->lock);
-
-       return 0;
-}
-
-static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
-                              struct qc_dqblk *qdq)
-{
-       struct bch_fs *c                = sb->s_fs_info;
-       struct bch_memquota_type *q     = &c->quotas[kqid->type];
-       qid_t qid                       = from_kqid(&init_user_ns, *kqid);
-       struct genradix_iter iter;
-       struct bch_memquota *mq;
-       int ret = 0;
-
-       mutex_lock(&q->lock);
-
-       genradix_for_each_from(&q->table, iter, mq, qid)
-               if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
-                       __bch2_quota_get(qdq, mq);
-                       *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
-                       goto found;
-               }
-
-       ret = -ENOENT;
-found:
-       mutex_unlock(&q->lock);
-       return bch2_err_class(ret);
-}
-
-static int bch2_set_quota_trans(struct btree_trans *trans,
-                               struct bkey_i_quota *new_quota,
-                               struct qc_dqblk *qdq)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
-                              BTREE_ITER_slots|BTREE_ITER_intent);
-       ret = bkey_err(k);
-       if (unlikely(ret))
-               return ret;
-
-       if (k.k->type == KEY_TYPE_quota)
-               new_quota->v = *bkey_s_c_to_quota(k).v;
-
-       if (qdq->d_fieldmask & QC_SPC_SOFT)
-               new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
-       if (qdq->d_fieldmask & QC_SPC_HARD)
-               new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
-
-       if (qdq->d_fieldmask & QC_INO_SOFT)
-               new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
-       if (qdq->d_fieldmask & QC_INO_HARD)
-               new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
-
-       ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
-                         struct qc_dqblk *qdq)
-{
-       struct bch_fs *c = sb->s_fs_info;
-       struct bkey_i_quota new_quota;
-       int ret;
-
-       if (0) {
-               struct printbuf buf = PRINTBUF;
-
-               qc_dqblk_to_text(&buf, qdq);
-               pr_info("setting:\n%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       if (sb->s_flags & SB_RDONLY)
-               return -EROFS;
-
-       bkey_quota_init(&new_quota.k_i);
-       new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
-       ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-                           bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
-               __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
-
-       return bch2_err_class(ret);
-}
-
-const struct quotactl_ops bch2_quotactl_operations = {
-       .quota_enable           = bch2_quota_enable,
-       .quota_disable          = bch2_quota_disable,
-       .rm_xquota              = bch2_quota_remove,
-
-       .get_state              = bch2_quota_get_state,
-       .set_info               = bch2_quota_set_info,
-
-       .get_dqblk              = bch2_get_quota,
-       .get_nextdqblk          = bch2_get_next_quota,
-       .set_dqblk              = bch2_set_quota,
-};
-
-#endif /* CONFIG_BCACHEFS_QUOTA */
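A detail worth flagging in the deleted bch2_get_next_quota() above: genradix slots come from zero-filled pages, so an entry that was never written is all zeroes, and the scan detects populated slots by memcmp()ing each one against ZERO_PAGE(0). A minimal standalone sketch of the same first-nonzero-record scan (hypothetical names, plain userspace C, not bcachefs code):

    /* Find the first record that differs from an all-zero template. */
    #include <stdio.h>
    #include <string.h>

    struct record {
            unsigned long used;
            unsigned long hardlimit;
            unsigned long softlimit;
    };

    /* Returns the index of the first non-zero record, or -1 if none. */
    static int first_populated(const struct record *tbl, size_t nr)
    {
            static const struct record zero;        /* all-zero template */

            for (size_t i = 0; i < nr; i++)
                    if (memcmp(&tbl[i], &zero, sizeof(zero)))
                            return (int) i;
            return -1;
    }

    int main(void)
    {
            struct record tbl[4] = { [2] = { .used = 100, .hardlimit = 1000 } };

            printf("first populated slot: %d\n", first_populated(tbl, 4));
            return 0;
    }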
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
deleted file mode 100644 (file)
index 1551800..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_H
-#define _BCACHEFS_QUOTA_H
-
-#include "inode.h"
-#include "quota_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-
-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
-                       struct bkey_validate_context);
-void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_quota ((struct bkey_ops) {       \
-       .key_validate   = bch2_quota_validate,          \
-       .val_to_text    = bch2_quota_to_text,           \
-       .min_val_size   = 32,                           \
-})
-
-static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
-{
-       return (struct bch_qid) {
-               .q[QTYP_USR] = u->bi_uid,
-               .q[QTYP_GRP] = u->bi_gid,
-               .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
-       };
-}
-
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
-       return ((c->opts.usrquota << QTYP_USR)|
-               (c->opts.grpquota << QTYP_GRP)|
-               (c->opts.prjquota << QTYP_PRJ));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
-                   s64, enum quota_acct_mode);
-
-int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
-                       struct bch_qid, u64, enum quota_acct_mode);
-
-void bch2_fs_quota_exit(struct bch_fs *);
-void bch2_fs_quota_init(struct bch_fs *);
-int bch2_fs_quota_read(struct bch_fs *);
-
-extern const struct quotactl_ops bch2_quotactl_operations;
-
-#else
-
-static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
-                                 enum quota_counters counter, s64 v,
-                                 enum quota_acct_mode mode)
-{
-       return 0;
-}
-
-static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
-                                     struct bch_qid dst,
-                                     struct bch_qid src, u64 space,
-                                     enum quota_acct_mode mode)
-{
-       return 0;
-}
-
-static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
-static inline void bch2_fs_quota_init(struct bch_fs *c) {}
-static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
-
-#endif
-
-#endif /* _BCACHEFS_QUOTA_H */
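The enabled_qtypes() helper above packs the usrquota/grpquota/prjquota mount options into a bitmask with one bit per quota type. A small standalone sketch of producing and consuming such a mask (hypothetical names, not bcachefs code):

    #include <stdio.h>

    enum { QTYP_USR, QTYP_GRP, QTYP_PRJ, QTYP_NR };

    static const char * const qtype_names[] = { "usr", "grp", "prj" };

    /* Bit i of the mask means quota type i is enabled. */
    static unsigned enabled_qtypes_mask(int usr, int grp, int prj)
    {
            return (usr << QTYP_USR) | (grp << QTYP_GRP) | (prj << QTYP_PRJ);
    }

    int main(void)
    {
            unsigned mask = enabled_qtypes_mask(1, 0, 1);   /* usr + prj */

            for (unsigned i = 0; i < QTYP_NR; i++)
                    if (mask & (1U << i))
                            printf("quota type enabled: %s\n", qtype_names[i]);
            return 0;
    }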
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
deleted file mode 100644 (file)
index dc34347..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_FORMAT_H
-#define _BCACHEFS_QUOTA_FORMAT_H
-
-/* KEY_TYPE_quota: */
-
-enum quota_types {
-       QTYP_USR                = 0,
-       QTYP_GRP                = 1,
-       QTYP_PRJ                = 2,
-       QTYP_NR                 = 3,
-};
-
-enum quota_counters {
-       Q_SPC                   = 0,
-       Q_INO                   = 1,
-       Q_COUNTERS              = 2,
-};
-
-struct bch_quota_counter {
-       __le64                  hardlimit;
-       __le64                  softlimit;
-};
-
-struct bch_quota {
-       struct bch_val          v;
-       struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
-       __le32                          timelimit;
-       __le32                          warnlimit;
-};
-
-struct bch_sb_quota_type {
-       __le64                          flags;
-       struct bch_sb_quota_counter     c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
-       struct bch_sb_field             field;
-       struct bch_sb_quota_type        q[QTYP_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_QUOTA_FORMAT_H */
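Note how this on-disk format lines up with the .min_val_size = 32 in the bkey_ops back in quota.h: assuming struct bch_val is an empty placeholder, a KEY_TYPE_quota value is two 16-byte counters, 32 bytes total. A hedged compile-time check of that arithmetic, using mirrored structs with fixed-width types (illustrative only, not the real definitions):

    #include <stdint.h>

    struct quota_counter_mirror {
            uint64_t hardlimit;
            uint64_t softlimit;
    };

    struct quota_mirror {
            struct quota_counter_mirror c[2];       /* Q_SPC, Q_INO */
    } __attribute__((packed, aligned(8)));

    _Static_assert(sizeof(struct quota_counter_mirror) == 16,
                   "counter must be 16 bytes on disk");
    _Static_assert(sizeof(struct quota_mirror) == 32,
                   "quota value must be 32 bytes on disk");

    int main(void) { return 0; }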
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
deleted file mode 100644 (file)
index 6a13608..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_TYPES_H
-#define _BCACHEFS_QUOTA_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-struct bch_qid {
-       u32             q[QTYP_NR];
-};
-
-enum quota_acct_mode {
-       KEY_TYPE_QUOTA_PREALLOC,
-       KEY_TYPE_QUOTA_WARN,
-       KEY_TYPE_QUOTA_NOCHECK,
-};
-
-struct memquota_counter {
-       u64                             v;
-       u64                             hardlimit;
-       u64                             softlimit;
-       s64                             timer;
-       int                             warns;
-       int                             warning_issued;
-};
-
-struct bch_memquota {
-       struct memquota_counter         c[Q_COUNTERS];
-};
-
-typedef GENRADIX(struct bch_memquota)  bch_memquota_table;
-
-struct quota_limit {
-       u32                             timelimit;
-       u32                             warnlimit;
-};
-
-struct bch_memquota_type {
-       struct quota_limit              limits[Q_COUNTERS];
-       bch_memquota_table              table;
-       struct mutex                    lock;
-};
-
-#endif /* _BCACHEFS_QUOTA_TYPES_H */
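The Q_SPC counters and limits in these in-memory structs are kept in 512-byte sectors, which is why the quota.c code above shifts by 9 when translating to and from the byte-based qc_dqblk interface. A trivial standalone sketch of that conversion (hypothetical names):

    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 9  /* 1 sector = 512 bytes */

    static inline uint64_t sectors_to_bytes(uint64_t sectors)
    {
            return sectors << SECTOR_SHIFT;
    }

    static inline uint64_t bytes_to_sectors(uint64_t bytes)
    {
            return bytes >> SECTOR_SHIFT;   /* rounds down, as in quota.c */
    }

    int main(void)
    {
            uint64_t limit_bytes = 1 << 20; /* 1 MiB quota limit */

            printf("%llu bytes = %llu sectors\n",
                   (unsigned long long) limit_bytes,
                   (unsigned long long) bytes_to_sectors(limit_bytes));
            return 0;
    }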
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
deleted file mode 100644 (file)
index b1438be..0000000
+++ /dev/null
@@ -1,666 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-
-#include <linux/generic-radix-tree.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/srcu.h>
-#include <linux/vmalloc.h>
-
-#include "rcu_pending.h"
-#include "darray.h"
-#include "util.h"
-
-#define static_array_for_each(_a, _i)                  \
-       for (typeof(&(_a)[0]) _i = _a;                  \
-            _i < (_a) + ARRAY_SIZE(_a);                \
-            _i++)
-
-enum rcu_pending_special {
-       RCU_PENDING_KVFREE      = 1,
-       RCU_PENDING_CALL_RCU    = 2,
-};
-
-#define RCU_PENDING_KVFREE_FN          ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
-#define RCU_PENDING_CALL_RCU_FN                ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
-
-#ifdef __KERNEL__
-typedef unsigned long                  rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
-       return l == r;
-}
-#else
-typedef struct urcu_gp_poll_state      rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
-       return l.grace_period_id == r.grace_period_id;
-}
-#endif
-
-static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
-{
-       return ssp
-               ? get_state_synchronize_srcu(ssp)
-               : get_state_synchronize_rcu();
-}
-
-static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
-{
-       return ssp
-               ? start_poll_synchronize_srcu(ssp)
-               : start_poll_synchronize_rcu();
-}
-
-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
-{
-       return ssp
-               ? poll_state_synchronize_srcu(ssp, cookie)
-               : poll_state_synchronize_rcu(cookie);
-}
-
-static inline void __rcu_barrier(struct srcu_struct *ssp)
-{
-       return ssp
-               ? srcu_barrier(ssp)
-               : rcu_barrier();
-}
-
-static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
-                             rcu_callback_t func)
-{
-       if (ssp)
-               call_srcu(ssp, rhp, func);
-       else
-               call_rcu(rhp, func);
-}
-
-struct rcu_pending_seq {
-       /*
-        * We're using a radix tree like a vector - we're just pushing elements
-        * onto the end; we're using a radix tree instead of an actual vector to
-        * avoid reallocation overhead
-        */
-       GENRADIX(struct rcu_head *)     objs;
-       size_t                          nr;
-       struct rcu_head                 **cursor;
-       rcu_gp_poll_state_t             seq;
-};
-
-struct rcu_pending_list {
-       struct rcu_head                 *head;
-       struct rcu_head                 *tail;
-       rcu_gp_poll_state_t             seq;
-};
-
-struct rcu_pending_pcpu {
-       struct rcu_pending              *parent;
-       spinlock_t                      lock;
-       int                             cpu;
-
-       /*
-        * We can't bound the number of unprocessed gp sequence numbers, and we
-        * can't efficiently merge radix trees for expired grace periods, so we
-        * need darray/vector:
-        */
-       DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
-
-       /* Third entry is for expired objects: */
-       struct rcu_pending_list         lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
-
-       struct rcu_head                 cb;
-       bool                            cb_armed;
-       struct work_struct              work;
-};
-
-static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
-{
-       if (p->objs.nr)
-               return true;
-
-       static_array_for_each(p->lists, i)
-               if (i->head)
-                       return true;
-
-       return false;
-}
-
-static void rcu_pending_list_merge(struct rcu_pending_list *l1,
-                                  struct rcu_pending_list *l2)
-{
-#ifdef __KERNEL__
-       if (!l1->head)
-               l1->head = l2->head;
-       else
-               l1->tail->next = l2->head;
-#else
-       if (!l1->head)
-               l1->head = l2->head;
-       else
-               l1->tail->next.next = (void *) l2->head;
-#endif
-
-       l1->tail = l2->tail;
-       l2->head = l2->tail = NULL;
-}
-
-static void rcu_pending_list_add(struct rcu_pending_list *l,
-                                struct rcu_head *n)
-{
-#ifdef __KERNEL__
-       if (!l->head)
-               l->head = n;
-       else
-               l->tail->next = n;
-       l->tail = n;
-       n->next = NULL;
-#else
-       if (!l->head)
-               l->head = n;
-       else
-               l->tail->next.next = (void *) n;
-       l->tail = n;
-       n->next.next = NULL;
-#endif
-}
-
-static void merge_expired_lists(struct rcu_pending_pcpu *p)
-{
-       struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
-
-       for (struct rcu_pending_list *i = p->lists; i < expired; i++)
-               if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
-                       rcu_pending_list_merge(expired, i);
-}
-
-#ifndef __KERNEL__
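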
-static inline void kfree_bulk(size_t nr, void **p)
-{
-       while (nr--)
-               kfree(*p);
-}
-#endif
-
-static noinline void __process_finished_items(struct rcu_pending *pending,
-                                             struct rcu_pending_pcpu *p,
-                                             unsigned long flags)
-{
-       struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
-       struct rcu_pending_seq objs = {};
-       struct rcu_head *list = NULL;
-
-       if (p->objs.nr &&
-           __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
-               objs = p->objs.data[0];
-               darray_remove_item(&p->objs, p->objs.data);
-       }
-
-       merge_expired_lists(p);
-
-       list = expired->head;
-       expired->head = expired->tail = NULL;
-
-       spin_unlock_irqrestore(&p->lock, flags);
-
-       switch ((ulong) pending->process) {
-       case RCU_PENDING_KVFREE:
-               for (size_t i = 0; i < objs.nr; ) {
-                       size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
-
-                       kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
-                       i += nr_this_node;
-               }
-               genradix_free(&objs.objs);
-
-               while (list) {
-                       struct rcu_head *obj = list;
-#ifdef __KERNEL__
-                       list = obj->next;
-#else
-                       list = (void *) obj->next.next;
-#endif
-
-                       /*
-                        * low bit of pointer indicates whether rcu_head needs
-                        * to be freed - kvfree_rcu_mightsleep()
-                        */
-                       BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
-
-                       void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
-                       bool free_head = ((unsigned long) obj->func) & 1UL;
-
-                       kvfree(ptr);
-                       if (free_head)
-                               kfree(obj);
-               }
-
-               break;
-
-       case RCU_PENDING_CALL_RCU:
-               for (size_t i = 0; i < objs.nr; i++) {
-                       struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
-                       obj->func(obj);
-               }
-               genradix_free(&objs.objs);
-
-               while (list) {
-                       struct rcu_head *obj = list;
-#ifdef __KERNEL__
-                       list = obj->next;
-#else
-                       list = (void *) obj->next.next;
-#endif
-                       obj->func(obj);
-               }
-               break;
-
-       default:
-               for (size_t i = 0; i < objs.nr; i++)
-                       pending->process(pending, *genradix_ptr(&objs.objs, i));
-               genradix_free(&objs.objs);
-
-               while (list) {
-                       struct rcu_head *obj = list;
-#ifdef __KERNEL__
-                       list = obj->next;
-#else
-                       list = (void *) obj->next.next;
-#endif
-                       pending->process(pending, obj);
-               }
-               break;
-       }
-}
-
-static bool process_finished_items(struct rcu_pending *pending,
-                                  struct rcu_pending_pcpu *p,
-                                  unsigned long flags)
-{
-       /*
-        * XXX: we should grab the gp seq once and avoid multiple function
-        * calls; this is called from the __rcu_pending_enqueue() fastpath in
-        * may_sleep==true mode
-        */
-       if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
-           (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
-           (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
-           p->lists[2].head) {
-               __process_finished_items(pending, p, flags);
-               return true;
-       }
-
-       return false;
-}
-
-static void rcu_pending_work(struct work_struct *work)
-{
-       struct rcu_pending_pcpu *p =
-               container_of(work, struct rcu_pending_pcpu, work);
-       struct rcu_pending *pending = p->parent;
-       unsigned long flags;
-
-       do {
-               spin_lock_irqsave(&p->lock, flags);
-       } while (process_finished_items(pending, p, flags));
-
-       spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void rcu_pending_rcu_cb(struct rcu_head *rcu)
-{
-       struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
-
-       schedule_work_on(p->cpu, &p->work);
-
-       unsigned long flags;
-       spin_lock_irqsave(&p->lock, flags);
-       if (__rcu_pending_has_pending(p)) {
-               spin_unlock_irqrestore(&p->lock, flags);
-               __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
-       } else {
-               p->cb_armed = false;
-               spin_unlock_irqrestore(&p->lock, flags);
-       }
-}
-
-static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
-{
-       darray_for_each_reverse(p->objs, objs)
-               if (rcu_gp_poll_cookie_eq(objs->seq, seq))
-                       return objs;
-
-       if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
-               return NULL;
-
-       return &darray_last(p->objs);
-}
-
-static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
-                        struct rcu_head *head, void *ptr,
-                        unsigned long *flags)
-{
-       if (ptr) {
-               if (!head) {
-                       /*
-                        * kvfree_rcu_mightsleep(): we weren't passed an
-                        * rcu_head, but we need one: use the low bit of the
-                        * pointer to free to flag that the head needs to be
-                        * freed as well:
-                        */
-                       ptr = (void *)(((unsigned long) ptr)|1UL);
-                       head = kmalloc(sizeof(*head), __GFP_NOWARN);
-                       if (!head) {
-                               spin_unlock_irqrestore(&p->lock, *flags);
-                               head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
-                               /*
-                                * dropped lock, did GFP_KERNEL allocation,
-                                * check for gp expiration
-                                */
-                               if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
-                                       kvfree(--ptr);
-                                       kfree(head);
-                                       spin_lock_irqsave(&p->lock, *flags);
-                                       return false;
-                               }
-                       }
-               }
-
-               head->func = ptr;
-       }
-again:
-       for (struct rcu_pending_list *i = p->lists;
-            i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
-               if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
-                       rcu_pending_list_add(i, head);
-                       return false;
-               }
-       }
-
-       for (struct rcu_pending_list *i = p->lists;
-            i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
-               if (!i->head) {
-                       i->seq = seq;
-                       rcu_pending_list_add(i, head);
-                       return true;
-               }
-       }
-
-       merge_expired_lists(p);
-       goto again;
-}
-
-/*
- * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
- * pending->process) once a grace period elapses.
- *
- * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
- * back to a linked list.
- *
- * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
- *   process callback
- *
- * - If @ptr and @head are both not NULL, we're kvfree_rcu()
- *
- * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
- *
- * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
- *   expired items.
- */
-static __always_inline void
-__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
-                     void *ptr, bool may_sleep)
-{
-
-       struct rcu_pending_pcpu *p;
-       struct rcu_pending_seq *objs;
-       struct genradix_node *new_node = NULL;
-       unsigned long flags;
-       bool start_gp = false;
-
-       BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
-
-       /* We could technically be scheduled before taking the lock and end up
-        * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
-        * anyway
-        *
-        * And we have to do it this way to avoid breaking PREEMPT_RT, which
-        * redefines how spinlocks work:
-        */
-       p = raw_cpu_ptr(pending->p);
-       spin_lock_irqsave(&p->lock, flags);
-       rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
-restart:
-       if (may_sleep &&
-           unlikely(process_finished_items(pending, p, flags)))
-               goto check_expired;
-
-       /*
-        * In kvfree_rcu() mode, the radix tree is only for slab pointers so
-        * that we can do kfree_bulk() - vmalloc pointers always use the linked
-        * list:
-        */
-       if (ptr && unlikely(is_vmalloc_addr(ptr)))
-               goto list_add;
-
-       objs = get_object_radix(p, seq);
-       if (unlikely(!objs))
-               goto list_add;
-
-       if (unlikely(!objs->cursor)) {
-               /*
-                * New radix tree nodes must be added under @p->lock because the
-                * tree root is in a darray that can be resized (typically,
-                * genradix supports concurrent unlocked allocation of new
-                * nodes) - hence preallocation and the retry loop:
-                */
-               objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
-                                               objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
-               if (unlikely(!objs->cursor)) {
-                       if (may_sleep) {
-                               spin_unlock_irqrestore(&p->lock, flags);
-
-                               gfp_t gfp = GFP_KERNEL;
-                               if (!head)
-                                       gfp |= __GFP_NOFAIL;
-
-                               new_node = genradix_alloc_node(gfp);
-                               if (!new_node)
-                                       may_sleep = false;
-                               goto check_expired;
-                       }
-list_add:
-                       start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
-                       goto start_gp;
-               }
-       }
-
-       *objs->cursor++ = ptr ?: head;
-       /* zero cursor if we hit the end of a radix tree node: */
-       if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
-               objs->cursor = NULL;
-       start_gp = !objs->nr;
-       objs->nr++;
-start_gp:
-       if (unlikely(start_gp)) {
-               /*
-                * We only have one callback (ideally, we would have one for
-                * every outstanding grace period) - so if our callback is
-                * already in flight, we may still have to start a grace period
-                * (since we used get_state() above, not start_poll())
-                */
-               if (!p->cb_armed) {
-                       p->cb_armed = true;
-                       __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
-               } else {
-                       __start_poll_synchronize_rcu(pending->srcu);
-               }
-       }
-       spin_unlock_irqrestore(&p->lock, flags);
-free_node:
-       if (new_node)
-               genradix_free_node(new_node);
-       return;
-check_expired:
-       if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
-               switch ((ulong) pending->process) {
-               case RCU_PENDING_KVFREE:
-                       kvfree(ptr);
-                       break;
-               case RCU_PENDING_CALL_RCU:
-                       head->func(head);
-                       break;
-               default:
-                       pending->process(pending, head);
-                       break;
-               }
-               goto free_node;
-       }
-
-       p = raw_cpu_ptr(pending->p);
-       spin_lock_irqsave(&p->lock, flags);
-       goto restart;
-}
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
-{
-       __rcu_pending_enqueue(pending, obj, NULL, true);
-}
-
-static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
-{
-       struct rcu_head *ret = NULL;
-
-       spin_lock_irq(&p->lock);
-       darray_for_each(p->objs, objs)
-               if (objs->nr) {
-                       ret = *genradix_ptr(&objs->objs, --objs->nr);
-                       objs->cursor = NULL;
-                       if (!objs->nr)
-                               genradix_free(&objs->objs);
-                       goto out;
-               }
-
-       static_array_for_each(p->lists, i)
-               if (i->head) {
-                       ret = i->head;
-#ifdef __KERNEL__
-                       i->head = ret->next;
-#else
-                       i->head = (void *) ret->next.next;
-#endif
-                       if (!i->head)
-                               i->tail = NULL;
-                       goto out;
-               }
-out:
-       spin_unlock_irq(&p->lock);
-
-       return ret;
-}
-
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
-{
-       return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
-}
-
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
-{
-       struct rcu_head *ret = rcu_pending_dequeue(pending);
-
-       if (ret)
-               return ret;
-
-       int cpu;
-       for_each_possible_cpu(cpu) {
-               ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
-               if (ret)
-                       break;
-       }
-       return ret;
-}
-
-static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
-{
-       int cpu;
-       for_each_possible_cpu(cpu) {
-               struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-               spin_lock_irq(&p->lock);
-               if (__rcu_pending_has_pending(p) || p->cb_armed) {
-                       spin_unlock_irq(&p->lock);
-                       return true;
-               }
-               spin_unlock_irq(&p->lock);
-       }
-
-       return false;
-}
-
-void rcu_pending_exit(struct rcu_pending *pending)
-{
-       int cpu;
-
-       if (!pending->p)
-               return;
-
-       while (rcu_pending_has_pending_or_armed(pending)) {
-               __rcu_barrier(pending->srcu);
-
-               for_each_possible_cpu(cpu) {
-                       struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-                       flush_work(&p->work);
-               }
-       }
-
-       for_each_possible_cpu(cpu) {
-               struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-               flush_work(&p->work);
-       }
-
-       for_each_possible_cpu(cpu) {
-               struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-
-               static_array_for_each(p->lists, i)
-                       WARN_ON(i->head);
-               WARN_ON(p->objs.nr);
-               darray_exit(&p->objs);
-       }
-       free_percpu(pending->p);
-}
-
-/**
- * rcu_pending_init: - initialize a rcu_pending
- *
- * @pending:   Object to init
- * @srcu:      May optionally be used with an srcu_struct; if NULL, uses normal
- *             RCU flavor
- * @process:   Callback function invoked on objects once their RCU barriers
- *             have completed; if NULL, kvfree() is used.
- */
-int rcu_pending_init(struct rcu_pending *pending,
-                    struct srcu_struct *srcu,
-                    rcu_pending_process_fn process)
-{
-       pending->p = alloc_percpu(struct rcu_pending_pcpu);
-       if (!pending->p)
-               return -ENOMEM;
-
-       int cpu;
-       for_each_possible_cpu(cpu) {
-               struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-               p->parent       = pending;
-               p->cpu          = cpu;
-               spin_lock_init(&p->lock);
-               darray_init(&p->objs);
-               INIT_WORK(&p->work, rcu_pending_work);
-       }
-
-       pending->srcu = srcu;
-       pending->process = process;
-
-       return 0;
-}
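One subtlety in the code above deserves a standalone illustration: the kvfree_rcu_mightsleep() path stores the pointer-to-free in head->func with its low bit set when the rcu_head itself had to be kmalloc()ed, relying on allocator alignment (hence the BUILD_BUG_ON on ARCH_SLAB_MINALIGN), and later masks that bit off before freeing. A minimal userspace demo of the low-bit pointer tag (hypothetical names, not kernel code):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void *tag_ptr(void *p)
    {
            /* Allocator alignment leaves bit 0 of the address free. */
            assert(((unsigned long) p & 1UL) == 0);
            return (void *)((unsigned long) p | 1UL);
    }

    static void *untag_ptr(void *p, int *tagged)
    {
            *tagged = (unsigned long) p & 1UL;
            return (void *)((unsigned long) p & ~1UL);
    }

    int main(void)
    {
            int *obj = malloc(sizeof(*obj));
            void *stashed = tag_ptr(obj);   /* stash a flag in bit 0 */

            int was_tagged;
            int *back = untag_ptr(stashed, &was_tagged);

            printf("tag recovered: %d, pointer intact: %d\n",
                   was_tagged, back == obj);
            free(back);
            return 0;
    }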
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
deleted file mode 100644 (file)
index 71a2f4d..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_RCU_PENDING_H
-#define _LINUX_RCU_PENDING_H
-
-#include <linux/rcupdate.h>
-
-struct rcu_pending;
-typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
-
-struct rcu_pending_pcpu;
-
-struct rcu_pending {
-       struct rcu_pending_pcpu __percpu *p;
-       struct srcu_struct              *srcu;
-       rcu_pending_process_fn          process;
-};
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
-
-void rcu_pending_exit(struct rcu_pending *pending);
-int rcu_pending_init(struct rcu_pending *pending,
-                    struct srcu_struct *srcu,
-                    rcu_pending_process_fn process);
-
-#endif /* _LINUX_RCU_PENDING_H */
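Based only on the declarations above and the kernel-doc for rcu_pending_init(), a hedged usage sketch: the process callback fires once a grace period has elapsed, and container_of() recovers the enclosing object. Everything except the rcu_pending_* calls is hypothetical:

    /* Hypothetical kernel-context sketch; not from the deleted code. */
    #include <linux/slab.h>
    #include "rcu_pending.h"

    struct my_obj {
            struct rcu_head rcu;
            int             payload;
    };

    static struct rcu_pending my_pending;

    static void my_process(struct rcu_pending *pending, struct rcu_head *rhp)
    {
            /* A grace period has elapsed; safe to free the object. */
            kfree(container_of(rhp, struct my_obj, rcu));
    }

    static int my_setup(void)
    {
            /* NULL srcu: use the normal RCU flavor. */
            return rcu_pending_init(&my_pending, NULL, my_process);
    }

    static void my_retire(struct my_obj *obj)
    {
            /* Processed once the current grace period completes. */
            rcu_pending_enqueue(&my_pending, &obj->rcu);
    }

    static void my_teardown(void)
    {
            rcu_pending_exit(&my_pending);  /* flushes all pending objects */
    }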
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
deleted file mode 100644 (file)
index 1c345b8..0000000
+++ /dev/null
@@ -1,889 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_write.h"
-#include "move.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-
-/* bch_extent_rebalance: */
-
-static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
-{
-       const union bch_extent_entry *entry;
-
-       bkey_extent_entry_for_each(ptrs, entry)
-               if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
-                       return &entry->rebalance;
-
-       return NULL;
-}
-
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
-       return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
-}
-
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
-                                          struct bch_io_opts *opts,
-                                          struct bkey_s_c k,
-                                          struct bkey_ptrs_c ptrs)
-{
-       if (!opts->background_compression)
-               return 0;
-
-       unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned ptr_bit = 1;
-       unsigned rewrite_ptrs = 0;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
-                   p.ptr.unwritten)
-                       return 0;
-
-               if (!p.ptr.cached && p.crc.compression_type != compression_type)
-                       rewrite_ptrs |= ptr_bit;
-               ptr_bit <<= 1;
-       }
-
-       return rewrite_ptrs;
-}
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
-                                      struct bch_io_opts *opts,
-                                      struct bkey_ptrs_c ptrs)
-{
-       if (!opts->background_target ||
-           !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
-               return 0;
-
-       unsigned ptr_bit = 1;
-       unsigned rewrite_ptrs = 0;
-
-       guard(rcu)();
-       bkey_for_each_ptr(ptrs, ptr) {
-               if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
-                       rewrite_ptrs |= ptr_bit;
-               ptr_bit <<= 1;
-       }
-
-       return rewrite_ptrs;
-}
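/*
 * Illustrative note, not original code: in the two helpers above, bit i
 * of the returned mask marks the i-th extent pointer as needing a
 * rewrite. E.g. for an extent with three pointers where only the first
 * and third miss the background target, rewrite_ptrs == 0b101 == 5.
 */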
-
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
-                                             struct bch_io_opts *opts,
-                                             struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-               return 0;
-
-       return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
-               bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
-       if (!opts)
-               return 0;
-
-       if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-               return 0;
-
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       u64 sectors = 0;
-
-       if (opts->background_compression) {
-               unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
-                           p.ptr.unwritten) {
-                               sectors = 0;
-                               goto incompressible;
-                       }
-
-                       if (!p.ptr.cached && p.crc.compression_type != compression_type)
-                               sectors += p.crc.compressed_size;
-               }
-       }
-incompressible:
-       if (opts->background_target) {
-               guard(rcu)();
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (!p.ptr.cached &&
-                           !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
-                               sectors += p.crc.compressed_size;
-       }
-
-       return sectors;
-}
-
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
-                                            struct bkey_s_c k)
-{
-       if (!bkey_extent_is_direct_data(k.k))
-               return 0;
-
-       const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
-
-       if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
-               struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
-               return old == NULL || memcmp(old, &new, sizeof(new));
-       } else {
-               return old != NULL;
-       }
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
-                                 struct bkey_i *_k)
-{
-       if (!bkey_extent_is_direct_data(&_k->k))
-               return 0;
-
-       struct bkey_s k = bkey_i_to_s(_k);
-       struct bch_extent_rebalance *old =
-               (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
-
-       if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
-               if (!old) {
-                       old = bkey_val_end(k);
-                       k.k->u64s += sizeof(*old) / sizeof(u64);
-               }
-
-               *old = io_opts_to_rebalance_opts(c, opts);
-       } else {
-               if (old)
-                       extent_entry_drop(k, (union bch_extent_entry *) old);
-       }
-
-       return 0;
-}
-
-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
-                                  struct bch_io_opts *io_opts,
-                                  struct btree_iter *iter,
-                                  struct bkey_s_c k)
-{
-       BUG_ON(iter->flags & BTREE_ITER_is_extents);
-       BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
-
-       const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
-               ? bch2_bkey_rebalance_opts(k) : NULL;
-       if (r) {
-#define x(_name)                                                       \
-               if (r->_name##_from_inode) {                            \
-                       io_opts->_name = r->_name;                      \
-                       io_opts->_name##_from_inode = true;             \
-               }
-               BCH_REBALANCE_OPTS()
-#undef x
-       }
-
-       if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
-               return 0;
-
-       struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
-       int ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       bkey_reassemble(n, k);
-
-       /* On successful transaction commit, @k was invalidated: */
-
-       return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
-               bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0) ?:
-               -BCH_ERR_transaction_restart_nested;
-}
-
-#define REBALANCE_WORK_SCAN_OFFSET     (U64_MAX - 1)
-
-static const char * const bch2_rebalance_state_strs[] = {
-#define x(t) #t,
-       BCH_REBALANCE_STATES()
-       NULL
-#undef x
-};
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_i_cookie *cookie;
-       u64 v;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
-                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
-                            BTREE_ITER_intent);
-       k = bch2_btree_iter_peek_slot(trans, &iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       v = k.k->type == KEY_TYPE_cookie
-               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
-               : 0;
-
-       cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
-       ret = PTR_ERR_OR_ZERO(cookie);
-       if (ret)
-               goto err;
-
-       bkey_cookie_init(&cookie->k_i);
-       cookie->k.p = iter.pos;
-       cookie->v.cookie = cpu_to_le64(v + 1);
-
-       ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
-{
-       int ret = bch2_trans_commit_do(c, NULL, NULL,
-                                      BCH_TRANS_COMMIT_no_enospc,
-                           bch2_set_rebalance_needs_scan_trans(trans, inum));
-       bch2_rebalance_wakeup(c);
-       return ret;
-}
-
-int bch2_set_fs_needs_rebalance(struct bch_fs *c)
-{
-       return bch2_set_rebalance_needs_scan(c, 0);
-}
-
-static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 v;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
-                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
-                            BTREE_ITER_intent);
-       k = bch2_btree_iter_peek_slot(trans, &iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       v = k.k->type == KEY_TYPE_cookie
-               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
-               : 0;
-
-       if (v == cookie)
-               ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
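/*
 * Illustrative note, not original code: the scan cookie is a
 * monotonically increasing counter. bch2_set_rebalance_needs_scan_trans()
 * writes v + 1; the clear above deletes the key only if the cookie still
 * matches the value the scan started from, so a scan requested while
 * another scan was already running is not lost.
 */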
-
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
-                                           struct btree_iter *work_iter)
-{
-       return !kthread_should_stop()
-               ? bch2_btree_iter_peek(trans, work_iter)
-               : bkey_s_c_null;
-}
-
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
-                                          struct btree_iter *iter,
-                                          struct bkey_s_c k)
-{
-       if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
-               return 0;
-
-       struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
-       int ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       extent_entry_drop(bkey_i_to_s(n),
-                         (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
-       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
-                       struct bpos work_pos,
-                       struct btree_iter *extent_iter,
-                       struct bch_io_opts *io_opts,
-                       struct data_update_opts *data_opts)
-{
-       struct bch_fs *c = trans->c;
-
-       bch2_trans_iter_exit(trans, extent_iter);
-       bch2_trans_iter_init(trans, extent_iter,
-                            work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
-                            work_pos,
-                            BTREE_ITER_all_snapshots);
-       struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
-       if (bkey_err(k))
-               return k;
-
-       int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
-       if (ret)
-               return bkey_s_c_err(ret);
-
-       memset(data_opts, 0, sizeof(*data_opts));
-       data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
-       data_opts->target               = io_opts->background_target;
-       data_opts->write_flags          |= BCH_WRITE_only_specified_devs;
-
-       if (!data_opts->rewrite_ptrs) {
-               /*
-                * device we would want to write to offline? devices in target
-                * changed?
-                *
-                * We'll now need a full scan before this extent is picked up
-                * again:
-                */
-               int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
-               if (ret)
-                       return bkey_s_c_err(ret);
-               return bkey_s_c_null;
-       }
-
-       if (trace_rebalance_extent_enabled()) {
-               struct printbuf buf = PRINTBUF;
-
-               bch2_bkey_val_to_text(&buf, c, k);
-               prt_newline(&buf);
-
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-               unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
-               if (p) {
-                       prt_str(&buf, "compression=");
-                       bch2_compression_opt_to_text(&buf, io_opts->background_compression);
-                       prt_str(&buf, " ");
-                       bch2_prt_u64_base2(&buf, p);
-                       prt_newline(&buf);
-               }
-
-               p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
-               if (p) {
-                       prt_str(&buf, "move=");
-                       bch2_target_to_text(&buf, c, io_opts->background_target);
-                       prt_str(&buf, " ");
-                       bch2_prt_u64_base2(&buf, p);
-                       prt_newline(&buf);
-               }
-
-               trace_rebalance_extent(c, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       return k;
-}
-
-noinline_for_stack
-static int do_rebalance_extent(struct moving_context *ctxt,
-                              struct bpos work_pos,
-                              struct btree_iter *extent_iter)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       struct bch_fs_rebalance *r = &trans->c->rebalance;
-       struct data_update_opts data_opts;
-       struct bch_io_opts io_opts;
-       struct bkey_s_c k;
-       struct bkey_buf sk;
-       int ret;
-
-       ctxt->stats = &r->work_stats;
-       r->state = BCH_REBALANCE_working;
-
-       bch2_bkey_buf_init(&sk);
-
-       ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
-                               extent_iter, &io_opts, &data_opts));
-       if (ret || !k.k)
-               goto out;
-
-       atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-
-       /*
-        * The iterator gets unlocked by __bch2_read_extent - need to
-        * save a copy of @k elsewhere:
-        */
-       bch2_bkey_buf_reassemble(&sk, c, k);
-       k = bkey_i_to_s_c(sk.k);
-
-       ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
-       if (ret) {
-               if (bch2_err_matches(ret, ENOMEM)) {
-                       /* memory allocation failure, wait for some IO to finish */
-                       bch2_move_ctxt_wait_for_io(ctxt);
-                       ret = bch_err_throw(c, transaction_restart_nested);
-               }
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       goto out;
-
-               /* skip it and continue, XXX signal failure */
-               ret = 0;
-       }
-out:
-       bch2_bkey_buf_exit(&sk, c);
-       return ret;
-}
-
-static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       struct bch_fs_rebalance *r = &trans->c->rebalance;
-
-       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-       ctxt->stats = &r->scan_stats;
-
-       if (!inum) {
-               r->scan_start   = BBPOS_MIN;
-               r->scan_end     = BBPOS_MAX;
-       } else {
-               r->scan_start   = BBPOS(BTREE_ID_extents, POS(inum, 0));
-               r->scan_end     = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
-       }
-
-       r->state = BCH_REBALANCE_scanning;
-
-       struct per_snapshot_io_opts snapshot_io_opts;
-       per_snapshot_io_opts_init(&snapshot_io_opts, c);
-
-       int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                                     r->scan_start.pos, r->scan_end.pos,
-                                     BTREE_ITER_all_snapshots|
-                                     BTREE_ITER_not_extents|
-                                     BTREE_ITER_prefetch, k, ({
-               ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
-               struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
-                                       &snapshot_io_opts, iter.pos, &iter, k);
-               PTR_ERR_OR_ZERO(io_opts);
-       })) ?:
-       commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                 bch2_clear_rebalance_needs_scan(trans, inum, cookie));
-
-       per_snapshot_io_opts_exit(&snapshot_io_opts);
-       bch2_move_stats_exit(&r->scan_stats, trans->c);
-
-       /*
-        * Ensure that the rebalance_work entries we created are seen by the
-        * next iteration of do_rebalance(), so we don't end up stuck in
-        * rebalance_wait():
-        */
-       atomic64_inc(&r->scan_stats.sectors_seen);
-       bch2_btree_write_buffer_flush_sync(trans);
-
-       return ret;
-}
-
-static void rebalance_wait(struct bch_fs *c)
-{
-       struct bch_fs_rebalance *r = &c->rebalance;
-       struct io_clock *clock = &c->io_clock[WRITE];
-       u64 now = atomic64_read(&clock->now);
-       u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
-       if (min_member_capacity == U64_MAX)
-               min_member_capacity = 128 * 2048;
-
-       r->wait_iotime_end              = now + (min_member_capacity >> 6);
-
-       if (r->state != BCH_REBALANCE_waiting) {
-               r->wait_iotime_start    = now;
-               r->wait_wallclock_start = ktime_get_real_ns();
-               r->state                = BCH_REBALANCE_waiting;
-       }
-
-       bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
-}
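/*
 * Worked example of the wait calculation above, illustrative only: with
 * no writable members, min_member_capacity falls back to 128 * 2048 =
 * 262144 sectors (128 MiB); ">> 6" divides by 64, so the thread waits
 * until the write IO clock has advanced by 4096 sectors (2 MiB), i.e.
 * rebalance wakes again after roughly 1/64 of the smallest member's
 * capacity has been written.
 */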
-
-static bool bch2_rebalance_enabled(struct bch_fs *c)
-{
-       return c->opts.rebalance_enabled &&
-               !(c->opts.rebalance_on_ac_only &&
-                 c->rebalance.on_battery);
-}
-
-static int do_rebalance(struct moving_context *ctxt)
-{
-       struct btree_trans *trans = ctxt->trans;
-       struct bch_fs *c = trans->c;
-       struct bch_fs_rebalance *r = &c->rebalance;
-       struct btree_iter rebalance_work_iter, extent_iter = {};
-       struct bkey_s_c k;
-       u32 kick = r->kick;
-       int ret = 0;
-
-       bch2_trans_begin(trans);
-
-       bch2_move_stats_init(&r->work_stats, "rebalance_work");
-       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-
-       bch2_trans_iter_init(trans, &rebalance_work_iter,
-                            BTREE_ID_rebalance_work, POS_MIN,
-                            BTREE_ITER_all_snapshots);
-
-       while (!bch2_move_ratelimit(ctxt)) {
-               if (!bch2_rebalance_enabled(c)) {
-                       bch2_moving_ctxt_flush_all(ctxt);
-                       kthread_wait_freezable(bch2_rebalance_enabled(c) ||
-                                              kthread_should_stop());
-               }
-
-               if (kthread_should_stop())
-                       break;
-
-               bch2_trans_begin(trans);
-
-               ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret || !k.k)
-                       break;
-
-               ret = k.k->type == KEY_TYPE_cookie
-                       ? do_rebalance_scan(ctxt, k.k->p.inode,
-                                           le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
-                       : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               bch2_btree_iter_advance(trans, &rebalance_work_iter);
-       }
-
-       bch2_trans_iter_exit(trans, &extent_iter);
-       bch2_trans_iter_exit(trans, &rebalance_work_iter);
-       bch2_move_stats_exit(&r->scan_stats, c);
-
-       if (!ret &&
-           !kthread_should_stop() &&
-           !atomic64_read(&r->work_stats.sectors_seen) &&
-           !atomic64_read(&r->scan_stats.sectors_seen) &&
-           kick == r->kick) {
-               bch2_moving_ctxt_flush_all(ctxt);
-               bch2_trans_unlock_long(trans);
-               rebalance_wait(c);
-       }
-
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_fn(c, ret);
-       return ret;
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
-       struct bch_fs *c = arg;
-       struct bch_fs_rebalance *r = &c->rebalance;
-       struct moving_context ctxt;
-
-       set_freezable();
-
-       /*
-        * Data move operations can't run until after check_snapshots has
-        * completed, and bch2_snapshot_is_ancestor() is available.
-        */
-       kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
-                              kthread_should_stop());
-
-       bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
-                             writepoint_ptr(&c->rebalance_write_point),
-                             true);
-
-       while (!kthread_should_stop() && !do_rebalance(&ctxt))
-               ;
-
-       bch2_moving_ctxt_exit(&ctxt);
-
-       return 0;
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       printbuf_tabstop_push(out, 32);
-
-       struct bch_fs_rebalance *r = &c->rebalance;
-
-       /* print pending work */
-       struct disk_accounting_pos acc;
-       disk_accounting_key_init(acc, rebalance_work);
-       u64 v;
-       bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-
-       prt_printf(out, "pending work:\t");
-       prt_human_readable_u64(out, v << 9);
-       prt_printf(out, "\n\n");
-
-       prt_str(out, bch2_rebalance_state_strs[r->state]);
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-
-       switch (r->state) {
-       case BCH_REBALANCE_waiting: {
-               u64 now = atomic64_read(&c->io_clock[WRITE].now);
-
-               prt_printf(out, "io wait duration:\t");
-               bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
-               prt_newline(out);
-
-               prt_printf(out, "io wait remaining:\t");
-               bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
-               prt_newline(out);
-
-               prt_printf(out, "duration waited:\t");
-               bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
-               prt_newline(out);
-               break;
-       }
-       case BCH_REBALANCE_working:
-               bch2_move_stats_to_text(out, &r->work_stats);
-               break;
-       case BCH_REBALANCE_scanning:
-               bch2_move_stats_to_text(out, &r->scan_stats);
-               break;
-       }
-       prt_newline(out);
-
-       struct task_struct *t;
-       scoped_guard(rcu) {
-               t = rcu_dereference(c->rebalance.thread);
-               if (t)
-                       get_task_struct(t);
-       }
-
-       if (t) {
-               bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
-               put_task_struct(t);
-       }
-
-       printbuf_indent_sub(out, 2);
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
-       struct task_struct *p;
-
-       c->rebalance.pd.rate.rate = UINT_MAX;
-       bch2_ratelimit_reset(&c->rebalance.pd.rate);
-
-       p = rcu_dereference_protected(c->rebalance.thread, 1);
-       c->rebalance.thread = NULL;
-
-       if (p) {
-               /* for synchronizing with bch2_rebalance_wakeup() */
-               synchronize_rcu();
-
-               kthread_stop(p);
-               put_task_struct(p);
-       }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
-       struct task_struct *p;
-       int ret;
-
-       if (c->rebalance.thread)
-               return 0;
-
-       if (c->opts.nochanges)
-               return 0;
-
-       p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
-       ret = PTR_ERR_OR_ZERO(p);
-       bch_err_msg(c, ret, "creating rebalance thread");
-       if (ret)
-               return ret;
-
-       get_task_struct(p);
-       rcu_assign_pointer(c->rebalance.thread, p);
-       wake_up_process(p);
-       return 0;
-}
-
-#ifdef CONFIG_POWER_SUPPLY
-#include <linux/power_supply.h>
-
-static int bch2_rebalance_power_notifier(struct notifier_block *nb,
-                                        unsigned long event, void *data)
-{
-       struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
-
-       c->rebalance.on_battery = !power_supply_is_system_supplied();
-       bch2_rebalance_wakeup(c);
-       return NOTIFY_OK;
-}
-#endif
-
-void bch2_fs_rebalance_exit(struct bch_fs *c)
-{
-#ifdef CONFIG_POWER_SUPPLY
-       power_supply_unreg_notifier(&c->rebalance.power_notifier);
-#endif
-}
-
-int bch2_fs_rebalance_init(struct bch_fs *c)
-{
-       struct bch_fs_rebalance *r = &c->rebalance;
-
-       bch2_pd_controller_init(&r->pd);
-
-#ifdef CONFIG_POWER_SUPPLY
-       r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
-       int ret = power_supply_reg_notifier(&r->power_notifier);
-       if (ret)
-               return ret;
-
-       r->on_battery = !power_supply_is_system_supplied();
-#endif
-       return 0;
-}
-
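-/*
- * Walk the extents and rebalance_work btrees in lockstep, treating a missing
- * key on either side as a deleted key at the same position: a rebalance_work
- * entry should exist iff the extent has pending rebalance work, and any
- * mismatch is repaired via the write buffer.
- */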
-static int check_rebalance_work_one(struct btree_trans *trans,
-                                   struct btree_iter *extent_iter,
-                                   struct btree_iter *rebalance_iter,
-                                   struct bkey_buf *last_flushed)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c extent_k, rebalance_k;
-       struct printbuf buf = PRINTBUF;
-
-       int ret = bkey_err(extent_k     = bch2_btree_iter_peek(trans, extent_iter)) ?:
-                 bkey_err(rebalance_k  = bch2_btree_iter_peek(trans, rebalance_iter));
-       if (ret)
-               return ret;
-
-       if (!extent_k.k &&
-           extent_iter->btree_id == BTREE_ID_reflink &&
-           (!rebalance_k.k ||
-            rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
-               bch2_trans_iter_exit(trans, extent_iter);
-               bch2_trans_iter_init(trans, extent_iter,
-                                    BTREE_ID_extents, POS_MIN,
-                                    BTREE_ITER_prefetch|
-                                    BTREE_ITER_all_snapshots);
-               return bch_err_throw(c, transaction_restart_nested);
-       }
-
-       if (!extent_k.k && !rebalance_k.k)
-               return 1;
-
-       int cmp = bpos_cmp(extent_k.k    ? extent_k.k->p    : SPOS_MAX,
-                          rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
-
-       struct bkey deleted;
-       bkey_init(&deleted);
-
-       if (cmp < 0) {
-               deleted.p = extent_k.k->p;
-               rebalance_k.k = &deleted;
-       } else if (cmp > 0) {
-               deleted.p = rebalance_k.k->p;
-               extent_k.k = &deleted;
-       }
-
-       bool should_have_rebalance =
-               bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
-       bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
-
-       if (should_have_rebalance != have_rebalance) {
-               ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
-               if (ret)
-                       return ret;
-
-               bch2_bkey_val_to_text(&buf, c, extent_k);
-       }
-
-       if (fsck_err_on(!should_have_rebalance && have_rebalance,
-                       trans, rebalance_work_incorrectly_set,
-                       "rebalance work incorrectly set\n%s", buf.buf)) {
-               ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-                                                 extent_k.k->p, false);
-               if (ret)
-                       goto err;
-       }
-
-       if (fsck_err_on(should_have_rebalance && !have_rebalance,
-                       trans, rebalance_work_incorrectly_unset,
-                       "rebalance work incorrectly unset\n%s", buf.buf)) {
-               ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-                                                 extent_k.k->p, true);
-               if (ret)
-                       goto err;
-       }
-
-       if (cmp <= 0)
-               bch2_btree_iter_advance(trans, extent_iter);
-       if (cmp >= 0)
-               bch2_btree_iter_advance(trans, rebalance_iter);
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
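-/*
- * fsck pass: check rebalance_work against the reflink and extents btrees.
- * The extent iterator starts on the reflink btree; check_rebalance_work_one()
- * switches it to the extents btree once reflink is exhausted.
- */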
-int bch2_check_rebalance_work(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter rebalance_iter, extent_iter;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &extent_iter,
-                            BTREE_ID_reflink, POS_MIN,
-                            BTREE_ITER_prefetch);
-       bch2_trans_iter_init(trans, &rebalance_iter,
-                            BTREE_ID_rebalance_work, POS_MIN,
-                            BTREE_ITER_prefetch);
-
-       struct bkey_buf last_flushed;
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       while (!ret) {
-               bch2_trans_begin(trans);
-
-               ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-       }
-
-       bch2_bkey_buf_exit(&last_flushed, c);
-       bch2_trans_iter_exit(trans, &extent_iter);
-       bch2_trans_iter_exit(trans, &rebalance_iter);
-       bch2_trans_put(trans);
-       return ret < 0 ? ret : 0;
-}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
deleted file mode 100644 (file)
index 7a565ea..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_H
-#define _BCACHEFS_REBALANCE_H
-
-#include "compress.h"
-#include "disk_groups.h"
-#include "opts.h"
-#include "rebalance_types.h"
-
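-/*
- * Build the on-disk rebalance options for an extent from the current io
- * options: every option in BCH_REBALANCE_OPTS() is copied along with a flag
- * recording whether it came from the inode; a background target that no
- * longer accepts user data is dropped.
- */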
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
-                                                                   struct bch_io_opts *opts)
-{
-       struct bch_extent_rebalance r = {
-               .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name)                                                       \
-               ._name = opts->_name,                                   \
-               ._name##_from_inode = opts->_name##_from_inode,
-               BCH_REBALANCE_OPTS()
-#undef x
-       };
-
-       if (r.background_target &&
-           !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
-               r.background_target = 0;
-
-       return r;
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
-                                  struct bch_io_opts *,
-                                  struct btree_iter *,
-                                  struct bkey_s_c);
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
-int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
-int bch2_set_fs_needs_rebalance(struct bch_fs *);
-
-static inline void bch2_rebalance_wakeup(struct bch_fs *c)
-{
-       c->rebalance.kick++;
-       guard(rcu)();
-       struct task_struct *p = rcu_dereference(c->rebalance.thread);
-       if (p)
-               wake_up_process(p);
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_rebalance_stop(struct bch_fs *);
-int bch2_rebalance_start(struct bch_fs *);
-
-void bch2_fs_rebalance_exit(struct bch_fs *);
-int bch2_fs_rebalance_init(struct bch_fs *);
-
-int bch2_check_rebalance_work(struct bch_fs *);
-
-#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
deleted file mode 100644 (file)
index ff9a134..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_FORMAT_H
-#define _BCACHEFS_REBALANCE_FORMAT_H
-
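-/*
- * On-disk extent entry carrying per-extent rebalance options; the bitfield
- * is declared in both bit orders so the in-memory layout matches the on-disk
- * layout on little- and big-endian hosts alike.
- */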
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:6,
-                               unused:3,
-
-                               promote_target_from_inode:1,
-                               erasure_code_from_inode:1,
-                               data_checksum_from_inode:1,
-                               background_compression_from_inode:1,
-                               data_replicas_from_inode:1,
-                               background_target_from_inode:1,
-
-                               promote_target:16,
-                               erasure_code:1,
-                               data_checksum:4,
-                               data_replicas:4,
-                               background_compression:8, /* enum bch_compression_opt */
-                               background_target:16;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-       __u64                   background_target:16,
-                               background_compression:8,
-                               data_replicas:4,
-                               data_checksum:4,
-                               erasure_code:1,
-                               promote_target:16,
-
-                               background_target_from_inode:1,
-                               data_replicas_from_inode:1,
-                               background_compression_from_inode:1,
-                               data_checksum_from_inode:1,
-                               erasure_code_from_inode:1,
-                               promote_target_from_inode:1,
-
-                               unused:3,
-                               type:6;
-#endif
-};
-
-/* subset of BCH_INODE_OPTS */
-#define BCH_REBALANCE_OPTS()                   \
-       x(data_checksum)                        \
-       x(background_compression)               \
-       x(data_replicas)                        \
-       x(promote_target)                       \
-       x(background_target)                    \
-       x(erasure_code)
-
-#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
deleted file mode 100644 (file)
index c659da1..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_TYPES_H
-#define _BCACHEFS_REBALANCE_TYPES_H
-
-#include "bbpos_types.h"
-#include "move_types.h"
-
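-/* x-macro: rebalance thread states, expanded below into enum bch_rebalance_states */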
-#define BCH_REBALANCE_STATES()         \
-       x(waiting)                      \
-       x(working)                      \
-       x(scanning)
-
-enum bch_rebalance_states {
-#define x(t)   BCH_REBALANCE_##t,
-       BCH_REBALANCE_STATES()
-#undef x
-};
-
-struct bch_fs_rebalance {
-       struct task_struct __rcu        *thread;
-       u32                             kick;
-       struct bch_pd_controller pd;
-
-       enum bch_rebalance_states       state;
-       u64                             wait_iotime_start;
-       u64                             wait_iotime_end;
-       u64                             wait_wallclock_start;
-
-       struct bch_move_stats           work_stats;
-
-       struct bbpos                    scan_start;
-       struct bbpos                    scan_end;
-       struct bch_move_stats           scan_stats;
-
-       bool                            on_battery;
-#ifdef CONFIG_POWER_SUPPLY
-       struct notifier_block           power_notifier;
-#endif
-};
-
-#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
deleted file mode 100644 (file)
index c94debb..0000000
+++ /dev/null
@@ -1,1306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "buckets.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "logged_ops.h"
-#include "move.h"
-#include "movinggc.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-downgrade.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-#include <linux/stat.h>
-
-int bch2_btree_lost_data(struct bch_fs *c,
-                        struct printbuf *msg,
-                        enum btree_id btree)
-{
-       u64 b = BIT_ULL(btree);
-       int ret = 0;
-
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
-       if (!(c->sb.btrees_lost_data & b)) {
-               prt_printf(msg, "flagging btree ");
-               bch2_btree_id_to_text(msg, btree);
-               prt_printf(msg, " lost data\n");
-
-               ext->btrees_lost_data |= cpu_to_le64(b);
-       }
-
-       /* Once we have runtime self healing for topology errors we won't need this: */
-       ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
-
-       /* Btree node accounting will be off: */
-       __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
-       ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       /*
-        * These are much more minor, and don't need to be corrected right away,
-        * but in debug mode we want the next fsck run to be clean:
-        */
-       ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
-       ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
-#endif
-
-       switch (btree) {
-       case BTREE_ID_alloc:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
-               goto out;
-       case BTREE_ID_backpointers:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
-               goto out;
-       case BTREE_ID_need_discard:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-               goto out;
-       case BTREE_ID_freespace:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-               goto out;
-       case BTREE_ID_bucket_gens:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-               goto out;
-       case BTREE_ID_lru:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-               goto out;
-       case BTREE_ID_accounting:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
-               goto out;
-       case BTREE_ID_snapshots:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
-               goto out;
-       default:
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
-               ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
-               goto out;
-       }
-out:
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
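-/* Mark a btree root dead and drop any of its keys still queued for journal replay: */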
-static void kill_btree(struct bch_fs *c, enum btree_id btree)
-{
-       bch2_btree_id_root(c, btree)->alive = false;
-       bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-}
-
-/* for -o reconstruct_alloc: */
-void bch2_reconstruct_alloc(struct bch_fs *c)
-{
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
-       __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
-       __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
-       __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
-       __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
-       __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
-
-       __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
-
-       __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
-
-       __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
-
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
-
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
-       __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
-       c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-
-       c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
-       c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
-               if (btree_id_is_alloc(i))
-                       kill_btree(c, i);
-}
-
-/*
- * Btree node pointers have a field to stash a pointer to the in-memory btree
- * node; we need to zero out this field when reading in btree nodes, or when
- * reading in keys from the journal:
- */
-static void zero_out_btree_mem_ptr(struct journal_keys *keys)
-{
-       darray_for_each(*keys, i)
-               if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
-                       bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
-}
-
-/* journal replay: */
-
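-/*
- * Advance the replay cursor to @seq, dropping our journal pins on everything
- * before it so those journal entries can be reclaimed.
- */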
-static void replay_now_at(struct journal *j, u64 seq)
-{
-       BUG_ON(seq < j->replay_journal_seq);
-
-       seq = min(seq, j->replay_journal_seq_end);
-
-       while (j->replay_journal_seq < seq)
-               bch2_journal_pin_put(j, j->replay_journal_seq++);
-}
-
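-/*
- * Accounting keys are deltas relative to what's in the btree: skip the update
- * if the btree key's version shows the delta was already applied before we
- * shut down, otherwise accumulate the existing value into the new key before
- * updating.
- */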
-static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
-                                             struct journal_key *k)
-{
-       struct btree_iter iter;
-       bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
-                                 BTREE_MAX_DEPTH, k->level,
-                                 BTREE_ITER_intent);
-       int ret = bch2_btree_iter_traverse(trans, &iter);
-       if (ret)
-               goto out;
-
-       struct bkey u;
-       struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
-
-       /* Has this delta already been applied to the btree? */
-       if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
-               ret = 0;
-               goto out;
-       }
-
-       struct bkey_i *new = k->k;
-       if (old.k->type == KEY_TYPE_accounting) {
-               new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto out;
-
-               bch2_accounting_accumulate(bkey_i_to_accounting(new),
-                                          bkey_s_c_to_accounting(old));
-       }
-
-       trans->journal_res.seq = k->journal_seq;
-
-       ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
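-/*
- * Replay one journal key. Keys flagged as overwritten are skipped; if the
- * btree isn't deep enough for the key's level (possible when recovering from
- * btree node scan), the depth is increased and the transaction restarted.
- */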
-static int bch2_journal_replay_key(struct btree_trans *trans,
-                                  struct journal_key *k)
-{
-       struct btree_iter iter;
-       unsigned iter_flags =
-               BTREE_ITER_intent|
-               BTREE_ITER_not_extents;
-       unsigned update_flags = BTREE_TRIGGER_norun;
-       int ret;
-
-       if (k->overwritten)
-               return 0;
-
-       trans->journal_res.seq = k->journal_seq;
-
-       /*
-        * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
-        * keep the key cache coherent with the underlying btree. Nothing
-        * besides the allocator is doing updates yet so we don't need key cache
-        * coherency for non-alloc btrees, and key cache fills for snapshots
-        * btrees use BTREE_ITER_filter_snapshots, which isn't available until
-        * the snapshots recovery pass runs.
-        */
-       if (!k->level && k->btree_id == BTREE_ID_alloc)
-               iter_flags |= BTREE_ITER_cached;
-       else
-               update_flags |= BTREE_UPDATE_key_cache_reclaim;
-
-       bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
-                                 BTREE_MAX_DEPTH, k->level,
-                                 iter_flags);
-       ret = bch2_btree_iter_traverse(trans, &iter);
-       if (ret)
-               goto out;
-
-       struct btree_path *path = btree_iter_path(trans, &iter);
-       if (unlikely(!btree_path_node(path, k->level))) {
-               struct bch_fs *c = trans->c;
-
-               CLASS(printbuf, buf)();
-               prt_str(&buf, "btree=");
-               bch2_btree_id_to_text(&buf, k->btree_id);
-               prt_printf(&buf, " level=%u ", k->level);
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
-
-               if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
-                                                    BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
-                       bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
-                               buf.buf);
-                       ret = -EINVAL;
-               }
-
-               if (!k->allocated) {
-                       bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
-                                  buf.buf);
-                       k->overwritten = true;
-                       goto out;
-               }
-
-               bch2_trans_iter_exit(trans, &iter);
-               bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
-                                         BTREE_MAX_DEPTH, 0, iter_flags);
-               ret =   bch2_btree_iter_traverse(trans, &iter) ?:
-                       bch2_btree_increase_depth(trans, iter.path, 0) ?:
-                       -BCH_ERR_transaction_restart_nested;
-               goto out;
-       }
-
-       /* Must be checked with btree locked: */
-       if (k->overwritten)
-               goto out;
-
-       if (k->k->k.type == KEY_TYPE_accounting) {
-               struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
-               ret = PTR_ERR_OR_ZERO(n);
-               if (ret)
-                       goto out;
-
-               bkey_copy(n, k->k);
-               goto out;
-       }
-
-       ret = bch2_trans_update(trans, &iter, k->k, update_flags);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int journal_sort_seq_cmp(const void *_l, const void *_r)
-{
-       const struct journal_key *l = *((const struct journal_key **)_l);
-       const struct journal_key *r = *((const struct journal_key **)_r);
-
-       /*
-        * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
-        *
-        * journal_seq == 0 means that the key comes from early repair, and
-        * should be inserted last so as to avoid overwriting keys from the
-        * journal
-        */
-       return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
-}
-
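-/*
- * Replay journal keys into the btree, in three phases: accounting keys first
- * (the write buffer must not flush accounting until they are all applied),
- * then everything else in sorted btree order for locality, and finally any
- * keys that failed the fast path in journal order, unpinning journal entries
- * as we go.
- */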
-int bch2_journal_replay(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       DARRAY(struct journal_key *) keys_sorted = { 0 };
-       struct journal *j = &c->journal;
-       u64 start_seq   = c->journal_replay_seq_start;
-       u64 end_seq     = c->journal_replay_seq_end;
-       struct btree_trans *trans = NULL;
-       bool immediate_flush = false;
-       int ret = 0;
-
-       if (keys->nr) {
-               ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
-                                          keys->nr, start_seq, end_seq);
-               if (ret)
-                       goto err;
-       }
-
-       BUG_ON(!atomic_read(&keys->ref));
-
-       move_gap(keys, keys->nr);
-       trans = bch2_trans_get(c);
-
-       /*
-        * Replay accounting keys first: we can't allow the write buffer to
-        * flush accounting keys until we're done
-        */
-       darray_for_each(*keys, k) {
-               if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
-                       continue;
-
-               cond_resched();
-
-               ret = commit_do(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc|
-                               BCH_TRANS_COMMIT_journal_reclaim|
-                               BCH_TRANS_COMMIT_skip_accounting_apply|
-                               BCH_TRANS_COMMIT_no_journal_res|
-                               BCH_WATERMARK_reclaim,
-                            bch2_journal_replay_accounting_key(trans, k));
-               if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
-                       goto err;
-
-               k->overwritten = true;
-       }
-
-       set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
-       /*
-        * First, attempt to replay keys in sorted order. This is more
-        * efficient - better locality of btree access - but some might fail if
-        * that would cause a journal deadlock.
-        */
-       darray_for_each(*keys, k) {
-               cond_resched();
-
-               /*
-                * k->allocated means the key wasn't read in from the journal,
-                * rather it was from early repair code
-                */
-               if (k->allocated)
-                       immediate_flush = true;
-
-               /* Skip fastpath if we're low on space in the journal */
-               ret = c->journal.watermark ? -1 :
-                       commit_do(trans, NULL, NULL,
-                                 BCH_TRANS_COMMIT_no_enospc|
-                                 BCH_TRANS_COMMIT_journal_reclaim|
-                                 BCH_TRANS_COMMIT_skip_accounting_apply|
-                                 (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
-                            bch2_journal_replay_key(trans, k));
-               BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
-               if (ret) {
-                       ret = darray_push(&keys_sorted, k);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       bch2_trans_unlock_long(trans);
-       /*
-        * Now, replay any remaining keys in the order in which they appear in
-        * the journal, unpinning those journal entries as we go:
-        */
-       sort_nonatomic(keys_sorted.data, keys_sorted.nr,
-                      sizeof(keys_sorted.data[0]),
-                      journal_sort_seq_cmp, NULL);
-
-       darray_for_each(keys_sorted, kp) {
-               cond_resched();
-
-               struct journal_key *k = *kp;
-
-               if (k->journal_seq)
-                       replay_now_at(j, k->journal_seq);
-               else
-                       replay_now_at(j, j->replay_journal_seq_end);
-
-               ret = commit_do(trans, NULL, NULL,
-                               BCH_TRANS_COMMIT_no_enospc|
-                               BCH_TRANS_COMMIT_skip_accounting_apply|
-                               (!k->allocated
-                                ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
-                                : 0),
-                            bch2_journal_replay_key(trans, k));
-               if (ret) {
-                       struct printbuf buf = PRINTBUF;
-                       bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
-                       bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
-                       printbuf_exit(&buf);
-                       goto err;
-               }
-
-               BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
-       }
-
-       /*
-        * We need to put our btree_trans before calling flush_all_pins(), since
-        * that will use a btree_trans internally
-        */
-       bch2_trans_put(trans);
-       trans = NULL;
-
-       if (!c->opts.retain_recovery_info &&
-           c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
-               bch2_journal_keys_put_initial(c);
-
-       replay_now_at(j, j->replay_journal_seq_end);
-       j->replay_journal_seq = 0;
-
-       bch2_journal_set_replay_done(j);
-
-       /* if we did any repair, flush it immediately */
-       if (immediate_flush) {
-               bch2_journal_flush_all_pins(&c->journal);
-               ret = bch2_journal_meta(&c->journal);
-       }
-
-       if (keys->nr)
-               bch2_journal_log_msg(c, "journal replay finished");
-err:
-       if (trans)
-               bch2_trans_put(trans);
-       darray_exit(&keys_sorted);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/* journal replay early: */
-
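-/*
- * Apply a journal entry that takes effect before key replay: btree roots,
- * the key version counter, journal seq blacklists, and io clock positions.
- */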
-static int journal_replay_entry_early(struct bch_fs *c,
-                                     struct jset_entry *entry)
-{
-       int ret = 0;
-
-       switch (entry->type) {
-       case BCH_JSET_ENTRY_btree_root: {
-               if (unlikely(!entry->u64s))
-                       return 0;
-
-               if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
-                               c, invalid_btree_id,
-                               "invalid btree id %u (max %u)",
-                               entry->btree_id, BTREE_ID_NR_MAX))
-                       return 0;
-
-               while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
-                       ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
-                       if (ret)
-                               return ret;
-               }
-
-               struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
-               r->level = entry->level;
-               bkey_copy(&r->key, (struct bkey_i *) entry->start);
-               r->error = 0;
-               r->alive = true;
-               break;
-       }
-       case BCH_JSET_ENTRY_usage: {
-               struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
-
-               switch (entry->btree_id) {
-               case BCH_FS_USAGE_key_version:
-                       atomic64_set(&c->key_version, le64_to_cpu(u->v));
-                       break;
-               }
-               break;
-       }
-       case BCH_JSET_ENTRY_blacklist: {
-               struct jset_entry_blacklist *bl_entry =
-                       container_of(entry, struct jset_entry_blacklist, entry);
-
-               ret = bch2_journal_seq_blacklist_add(c,
-                               le64_to_cpu(bl_entry->seq),
-                               le64_to_cpu(bl_entry->seq) + 1);
-               break;
-       }
-       case BCH_JSET_ENTRY_blacklist_v2: {
-               struct jset_entry_blacklist_v2 *bl_entry =
-                       container_of(entry, struct jset_entry_blacklist_v2, entry);
-
-               ret = bch2_journal_seq_blacklist_add(c,
-                               le64_to_cpu(bl_entry->start),
-                               le64_to_cpu(bl_entry->end) + 1);
-               break;
-       }
-       case BCH_JSET_ENTRY_clock: {
-               struct jset_entry_clock *clock =
-                       container_of(entry, struct jset_entry_clock, entry);
-
-               atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
-       }
-       }
-fsck_err:
-       return ret;
-}
-
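-/*
- * Feed every early journal entry to journal_replay_entry_early(): from the
- * superblock clean section after a clean shutdown, otherwise from the
- * journal entries we read.
- */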
-static int journal_replay_early(struct bch_fs *c,
-                               struct bch_sb_field_clean *clean)
-{
-       if (clean) {
-               for (struct jset_entry *entry = clean->start;
-                    entry != vstruct_end(&clean->field);
-                    entry = vstruct_next(entry)) {
-                       int ret = journal_replay_entry_early(c, entry);
-                       if (ret)
-                               return ret;
-               }
-       } else {
-               struct genradix_iter iter;
-               struct journal_replay *i, **_i;
-
-               genradix_for_each(&c->journal_entries, iter, _i) {
-                       i = *_i;
-
-                       if (journal_replay_ignore(i))
-                               continue;
-
-                       vstruct_for_each(&i->j, entry) {
-                               int ret = journal_replay_entry_early(c, entry);
-                               if (ret)
-                                       return ret;
-                       }
-               }
-       }
-
-       return 0;
-}
-
-/* sb clean section: */
-
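-/*
- * Read in the btree roots discovered during early replay. A root for an
- * alloc btree that fails to read has its error cleared, since fsck can
- * reconstruct it; btrees left without a root get a fake empty one.
- */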
-static int read_btree_roots(struct bch_fs *c)
-{
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
-               struct btree_root *r = bch2_btree_id_root(c, i);
-
-               if (!r->alive)
-                       continue;
-
-               printbuf_reset(&buf);
-               bch2_btree_id_level_to_text(&buf, i, r->level);
-
-               if (mustfix_fsck_err_on((ret = r->error),
-                                       c, btree_root_bkey_invalid,
-                                       "invalid btree root %s",
-                                       buf.buf) ||
-                   mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
-                                       c, btree_root_read_error,
-                                       "error reading btree root %s: %s",
-                                       buf.buf, bch2_err_str(ret))) {
-                       if (btree_id_is_alloc(i))
-                               r->error = 0;
-                       ret = 0;
-               }
-       }
-
-       for (unsigned i = 0; i < BTREE_ID_NR; i++) {
-               struct btree_root *r = bch2_btree_id_root(c, i);
-
-               if (!r->b && !r->error) {
-                       r->alive = false;
-                       r->level = 0;
-                       bch2_btree_root_alloc_fake(c, i, 0);
-               }
-       }
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
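-/*
- * Decide whether to upgrade the superblock version on mount, honouring the
- * version_upgrade option, and report which recovery passes the upgrade will
- * require; returns true if the superblock was upgraded and must be written.
- */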
-static bool check_version_upgrade(struct bch_fs *c)
-{
-       unsigned latest_version = bcachefs_metadata_version_current;
-       unsigned latest_compatible = min(latest_version,
-                                        bch2_latest_compatible_version(c->sb.version));
-       unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
-       unsigned new_version = 0;
-       bool ret = false;
-
-       if (old_version < bcachefs_metadata_required_upgrade_below) {
-               if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
-                   latest_compatible < bcachefs_metadata_required_upgrade_below)
-                       new_version = latest_version;
-               else
-                       new_version = latest_compatible;
-       } else {
-               switch (c->opts.version_upgrade) {
-               case BCH_VERSION_UPGRADE_compatible:
-                       new_version = latest_compatible;
-                       break;
-               case BCH_VERSION_UPGRADE_incompatible:
-                       new_version = latest_version;
-                       break;
-               case BCH_VERSION_UPGRADE_none:
-                       new_version = min(old_version, latest_version);
-                       break;
-               }
-       }
-
-       if (new_version > old_version) {
-               struct printbuf buf = PRINTBUF;
-
-               if (old_version < bcachefs_metadata_required_upgrade_below)
-                       prt_str(&buf, "Version upgrade required:\n");
-
-               if (old_version != c->sb.version) {
-                       prt_str(&buf, "Version upgrade from ");
-                       bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
-                       prt_str(&buf, " to ");
-                       bch2_version_to_text(&buf, c->sb.version);
-                       prt_str(&buf, " incomplete\n");
-               }
-
-               prt_printf(&buf, "Doing %s version upgrade from ",
-                          BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
-                          ? "incompatible" : "compatible");
-               bch2_version_to_text(&buf, old_version);
-               prt_str(&buf, " to ");
-               bch2_version_to_text(&buf, new_version);
-               prt_newline(&buf);
-
-               struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-               __le64 passes = ext->recovery_passes_required[0];
-               bch2_sb_set_upgrade(c, old_version, new_version);
-               passes = ext->recovery_passes_required[0] & ~passes;
-
-               if (passes) {
-                       prt_str(&buf, "  running recovery passes: ");
-                       prt_bitflags(&buf, bch2_recovery_passes,
-                                    bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
-               }
-
-               bch_notice(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-
-               ret = true;
-       }
-
-       if (new_version > c->sb.version_incompat_allowed &&
-           c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "Now allowing incompatible features up to ");
-               bch2_version_to_text(&buf, new_version);
-               prt_str(&buf, ", previously allowed up to ");
-               bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
-               prt_newline(&buf);
-
-               bch_notice(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-
-               ret = true;
-       }
-
-       if (ret)
-               bch2_sb_upgrade(c, new_version,
-                               c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
-
-       return ret;
-}
-
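-/*
- * Main recovery path: read the superblock clean section and/or the journal,
- * sort journal keys, blacklist unflushed journal sequence numbers, start the
- * journal, read btree roots, then run the recovery passes.
- */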
-int bch2_fs_recovery(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *clean = NULL;
-       struct jset *last_journal_entry = NULL;
-       u64 last_seq = 0, blacklist_seq, journal_seq;
-       int ret = 0;
-
-       if (c->sb.clean) {
-               clean = bch2_read_superblock_clean(c);
-               ret = PTR_ERR_OR_ZERO(clean);
-               if (ret)
-                       goto err;
-
-               bch_info(c, "recovering from clean shutdown, journal seq %llu",
-                        le64_to_cpu(clean->journal_seq));
-       } else {
-               bch_info(c, "recovering from unclean shutdown");
-       }
-
-       if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
-               bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (!c->sb.clean &&
-           !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
-               bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (c->opts.norecovery) {
-               c->opts.recovery_pass_last = c->opts.recovery_pass_last
-                       ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
-                       : BCH_RECOVERY_PASS_snapshots_read;
-               c->opts.nochanges = true;
-       }
-
-       if (c->opts.nochanges)
-               c->opts.read_only = true;
-
-       if (c->opts.journal_rewind) {
-               bch_info(c, "rewinding journal, fsck required");
-               c->opts.fsck = true;
-       }
-
-       if (go_rw_in_recovery(c)) {
-               /*
-                * start workqueues/kworkers early - kthread creation checks for
-                * pending signals, which is _very_ annoying
-                */
-               ret = bch2_fs_init_rw(c);
-               if (ret)
-                       goto err;
-       }
-
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-       bool write_sb = false;
-
-       if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
-               ext->recovery_passes_required[0] |=
-                       cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
-               write_sb = true;
-       }
-
-       u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-       if (sb_passes) {
-               struct printbuf buf = PRINTBUF;
-               prt_str(&buf, "superblock requires the following recovery passes to be run:\n  ");
-               prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
-               bch_info(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-
-       if (bch2_check_version_downgrade(c)) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "Version downgrade required:");
-
-               __le64 passes = ext->recovery_passes_required[0];
-               bch2_sb_set_downgrade(c,
-                                     BCH_VERSION_MINOR(bcachefs_metadata_version_current),
-                                     BCH_VERSION_MINOR(c->sb.version));
-               passes = ext->recovery_passes_required[0] & ~passes;
-               if (passes) {
-                       prt_str(&buf, "\n  running recovery passes: ");
-                       prt_bitflags(&buf, bch2_recovery_passes,
-                                    bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
-               }
-
-               bch_info(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-               write_sb = true;
-       }
-
-       if (check_version_upgrade(c))
-               write_sb = true;
-
-       c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
-       if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
-               SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
-               write_sb = true;
-       }
-
-       if (write_sb)
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       if (c->sb.clean)
-               set_bit(BCH_FS_clean_recovery, &c->flags);
-       if (c->opts.fsck)
-               set_bit(BCH_FS_in_fsck, &c->flags);
-       set_bit(BCH_FS_in_recovery, &c->flags);
-
-       ret = bch2_blacklist_table_initialize(c);
-       if (ret) {
-               bch_err(c, "error initializing blacklist table");
-               goto err;
-       }
-
-       bch2_journal_pos_from_member_info_resume(c);
-
-       if (!c->sb.clean || c->opts.retain_recovery_info) {
-               struct genradix_iter iter;
-               struct journal_replay **i;
-
-               bch_verbose(c, "starting journal read");
-               ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
-               if (ret)
-                       goto err;
-
-               /*
-                * note: cmd_list_journal needs the blacklist table fully up to date so
-                * it can asterisk ignored journal entries:
-                */
-               if (c->opts.read_journal_only)
-                       goto out;
-
-               genradix_for_each_reverse(&c->journal_entries, iter, i)
-                       if (!journal_replay_ignore(*i)) {
-                               last_journal_entry = &(*i)->j;
-                               break;
-                       }
-
-               if (mustfix_fsck_err_on(c->sb.clean &&
-                                       last_journal_entry &&
-                                       !journal_entry_empty(last_journal_entry), c,
-                               clean_but_journal_not_empty,
-                               "filesystem marked clean but journal not empty")) {
-                       c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-                       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-                       c->sb.clean = false;
-               }
-
-               if (!last_journal_entry) {
-                       fsck_err_on(!c->sb.clean, c,
-                                   dirty_but_no_journal_entries,
-                                   "no journal entries found");
-                       if (clean)
-                               goto use_clean;
-
-                       genradix_for_each_reverse(&c->journal_entries, iter, i)
-                               if (*i) {
-                                       last_journal_entry = &(*i)->j;
-                                       (*i)->ignore_blacklisted = false;
-                                       (*i)->ignore_not_dirty = false;
-                                       /*
-                                        * This was probably a NO_FLUSH entry,
-                                        * so last_seq was garbage - but we know
-                                        * we're only using a single journal
-                                        * entry, set it here:
-                                        */
-                                       (*i)->j.last_seq = (*i)->j.seq;
-                                       break;
-                               }
-               }
-
-               ret = bch2_journal_keys_sort(c);
-               if (ret)
-                       goto err;
-
-               if (c->sb.clean && last_journal_entry) {
-                       ret = bch2_verify_superblock_clean(c, &clean,
-                                                     last_journal_entry);
-                       if (ret)
-                               goto err;
-               }
-       } else {
-use_clean:
-               if (!clean) {
-                       bch_err(c, "no superblock clean section found");
-                       ret = bch_err_throw(c, fsck_repair_impossible);
-                       goto err;
-               }
-               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
-       }
-
-       c->journal_replay_seq_start     = last_seq;
-       c->journal_replay_seq_end       = blacklist_seq - 1;
-
-       zero_out_btree_mem_ptr(&c->journal_keys);
-
-       ret = journal_replay_early(c, clean);
-       if (ret)
-               goto err;
-
-       ret = bch2_fs_resize_on_mount(c);
-       if (ret) {
-               up_write(&c->state_lock);
-               goto err;
-       }
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
-               bch_info(c, "filesystem is an unresized image file, mounting ro");
-               c->opts.read_only = true;
-       }
-
-       if (!c->opts.read_only &&
-           (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
-               bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
-
-               bch2_reconstruct_alloc(c);
-       } else if (c->opts.reconstruct_alloc) {
-               bch2_journal_log_msg(c, "dropping alloc info");
-               bch_info(c, "dropping and reconstructing all alloc info");
-
-               bch2_reconstruct_alloc(c);
-       }
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
-               /* We can't go RW to fix errors without alloc info */
-               if (c->opts.fix_errors == FSCK_FIX_yes ||
-                   c->opts.fix_errors == FSCK_FIX_ask)
-                       c->opts.fix_errors = FSCK_FIX_no;
-               if (c->opts.errors == BCH_ON_ERROR_fix_safe)
-                       c->opts.errors = BCH_ON_ERROR_continue;
-       }
-
-       /*
-        * After an unclean shutdown, skip the next few journal sequence
-        * numbers as they may have been referenced by btree writes that
-        * happened before their corresponding journal writes - those btree
-        * writes need to be ignored, by skipping and blacklisting the next few
-        * journal sequence numbers:
-        */
-       if (!c->sb.clean)
-               journal_seq += JOURNAL_BUF_NR * 4;
-
-       if (blacklist_seq != journal_seq) {
-               ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
-                                            blacklist_seq, journal_seq) ?:
-                       bch2_journal_seq_blacklist_add(c,
-                                       blacklist_seq, journal_seq);
-               if (ret) {
-                       bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
-                       goto err;
-               }
-       }
-
-       ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
-                                    journal_seq, last_seq, blacklist_seq - 1) ?:
-               bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
-       if (ret)
-               goto err;
-
-       /*
-        * Skip past versions that might have possibly been used (as nonces),
-        * but hadn't had their pointers written:
-        */
-       if (c->sb.encryption_type && !c->sb.clean)
-               atomic64_add(1 << 16, &c->key_version);
-
-       ret = read_btree_roots(c);
-       if (ret)
-               goto err;
-
-       set_bit(BCH_FS_btree_running, &c->flags);
-
-       ret = bch2_sb_set_upgrade_extra(c);
-       if (ret)
-               goto err;
-
-       ret = bch2_run_recovery_passes(c, 0);
-       if (ret)
-               goto err;
-
-       /*
-        * Normally set by the appropriate recovery pass: when cleared, this
-        * indicates we're in early recovery and btree updates should be done by
-        * being applied to the journal replay keys. _Must_ be cleared before
-        * multithreaded use:
-        */
-       set_bit(BCH_FS_may_go_rw, &c->flags);
-       clear_bit(BCH_FS_in_fsck, &c->flags);
-
-       /* in case we don't run journal replay, i.e. norecovery mode */
-       set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
-       bch2_async_btree_node_rewrites_flush(c);
-
-       /* fsync if we fixed errors */
-       if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_meta(&c->journal);
-       }
-
-       /* If we fixed errors, verify that fs is actually clean now: */
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           test_bit(BCH_FS_errors_fixed, &c->flags) &&
-           !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
-           !test_bit(BCH_FS_error, &c->flags)) {
-               bch2_flush_fsck_errs(c);
-
-               bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
-               clear_bit(BCH_FS_errors_fixed, &c->flags);
-
-               ret = bch2_run_recovery_passes(c,
-                       BCH_RECOVERY_PASS_check_alloc_info);
-               if (ret)
-                       goto err;
-
-               if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
-                   test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
-                       bch_err(c, "Second fsck run was not clean");
-                       set_bit(BCH_FS_errors_not_fixed, &c->flags);
-               }
-
-               set_bit(BCH_FS_errors_fixed, &c->flags);
-       }
-
-       if (enabled_qtypes(c)) {
-               bch_verbose(c, "reading quotas");
-               ret = bch2_fs_quota_read(c);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "quotas done");
-       }
-
-       mutex_lock(&c->sb_lock);
-       ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-       write_sb = false;
-
-       if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
-               SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
-               write_sb = true;
-       }
-
-       if (!test_bit(BCH_FS_error, &c->flags) &&
-           !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
-               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
-               write_sb = true;
-       }
-
-       if (!test_bit(BCH_FS_error, &c->flags) &&
-           !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
-               memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
-               write_sb = true;
-       }
-
-       if (c->opts.fsck &&
-           !test_bit(BCH_FS_error, &c->flags) &&
-           c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
-           ext->btrees_lost_data) {
-               ext->btrees_lost_data = 0;
-               write_sb = true;
-       }
-
-       if (c->opts.fsck &&
-           !test_bit(BCH_FS_error, &c->flags) &&
-           !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
-               SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
-               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
-               write_sb = true;
-       }
-
-       if (bch2_blacklist_entries_gc(c))
-               write_sb = true;
-
-       if (write_sb)
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
-           c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
-               struct bch_move_stats stats;
-
-               bch2_move_stats_init(&stats, "recovery");
-
-               struct printbuf buf = PRINTBUF;
-               bch2_version_to_text(&buf, c->sb.version_min);
-               bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
-               printbuf_exit(&buf);
-
-               ret =   bch2_fs_read_write_early(c) ?:
-                       bch2_scan_old_btree_nodes(c, &stats);
-               if (ret)
-                       goto err;
-               bch_info(c, "scanning for old btree nodes done");
-       }
-
-       ret = 0;
-out:
-       bch2_flush_fsck_errs(c);
-
-       if (!ret &&
-           test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
-           !c->opts.nochanges) {
-               bch2_fs_read_write_early(c);
-               bch2_delete_dead_snapshots_async(c);
-       }
-
-       bch_err_fn(c, ret);
-final_out:
-       if (!IS_ERR(clean))
-               kfree(clean);
-       return ret;
-err:
-fsck_err:
-       {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
-               bch2_fs_emergency_read_only2(c, &buf);
-
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-       goto final_out;
-}
-
-int bch2_fs_initialize(struct bch_fs *c)
-{
-       struct bch_inode_unpacked root_inode, lostfound_inode;
-       struct bkey_inode_buf packed_inode;
-       struct qstr lostfound = QSTR("lost+found");
-       struct bch_member *m;
-       int ret;
-
-       bch_notice(c, "initializing new filesystem");
-       set_bit(BCH_FS_new_fs, &c->flags);
-
-       mutex_lock(&c->sb_lock);
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
-
-       bch2_check_version_downgrade(c);
-
-       if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
-               bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
-               SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
-               bch2_write_super(c);
-       }
-
-       for_each_member_device(c, ca) {
-               m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-               SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
-               ca->mi = bch2_mi_to_cpu(m);
-       }
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       set_bit(BCH_FS_btree_running, &c->flags);
-       set_bit(BCH_FS_may_go_rw, &c->flags);
-
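-       /* Brand new filesystem: allocate fake (empty) roots for every btree: */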
-       for (unsigned i = 0; i < BTREE_ID_NR; i++)
-               bch2_btree_root_alloc_fake(c, i, 0);
-
-       ret = bch2_fs_journal_alloc(c);
-       if (ret)
-               goto err;
-
-       /*
-        * journal_res_get() will crash unless bch2_fs_journal_start() has set
-        * up the journal.pin FIFO and journal.cur pointer:
-        */
-       ret = bch2_fs_journal_start(&c->journal, 1, 1);
-       if (ret)
-               goto err;
-
-       ret = bch2_fs_read_write_early(c);
-       if (ret)
-               goto err;
-
-       set_bit(BCH_FS_accounting_replay_done, &c->flags);
-       bch2_journal_set_replay_done(&c->journal);
-
-       for_each_member_device(c, ca) {
-               ret = bch2_dev_usage_init(ca, false);
-               if (ret) {
-                       bch2_dev_put(ca);
-                       goto err;
-               }
-       }
-
-       /*
-        * Write out the superblock and journal buckets, now that we can do
-        * btree updates
-        */
-       bch_verbose(c, "marking superblocks");
-       ret = bch2_trans_mark_dev_sbs(c);
-       bch_err_msg(c, ret, "marking superblocks");
-       if (ret)
-               goto err;
-
-       ret = bch2_fs_freespace_init(c);
-       if (ret)
-               goto err;
-
-       ret = bch2_initialize_subvolumes(c);
-       if (ret)
-               goto err;
-
-       bch_verbose(c, "reading snapshots table");
-       ret = bch2_snapshots_read(c);
-       if (ret)
-               goto err;
-       bch_verbose(c, "reading snapshots done");
-
-       bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
-       root_inode.bi_inum      = BCACHEFS_ROOT_INO;
-       root_inode.bi_subvol    = BCACHEFS_ROOT_SUBVOL;
-       bch2_inode_pack(&packed_inode, &root_inode);
-       packed_inode.inode.k.p.snapshot = U32_MAX;
-
-       ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
-       bch_err_msg(c, ret, "creating root directory");
-       if (ret)
-               goto err;
-
-       bch2_inode_init_early(c, &lostfound_inode);
-
-       ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-               bch2_create_trans(trans,
-                                 BCACHEFS_ROOT_SUBVOL_INUM,
-                                 &root_inode, &lostfound_inode,
-                                 &lostfound,
-                                 0, 0, S_IFDIR|0700, 0,
-                                 NULL, NULL, (subvol_inum) { 0 }, 0));
-       bch_err_msg(c, ret, "creating lost+found");
-       if (ret)
-               goto err;
-
-       c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
-
-       bch2_copygc_wakeup(c);
-       bch2_rebalance_wakeup(c);
-
-       if (enabled_qtypes(c)) {
-               ret = bch2_fs_quota_read(c);
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_journal_flush(&c->journal);
-       bch_err_msg(c, ret, "writing first journal entry");
-       if (ret)
-               goto err;
-
-       mutex_lock(&c->sb_lock);
-       SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
-       return 0;
-err:
-       bch_err_fn(c, ret);
-       return ret;
-}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
deleted file mode 100644 (file)
index c023f52..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_H
-#define _BCACHEFS_RECOVERY_H
-
-int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
-void bch2_reconstruct_alloc(struct bch_fs *);
-
-int bch2_journal_replay(struct bch_fs *);
-
-int bch2_fs_recovery(struct bch_fs *);
-int bch2_fs_initialize(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
deleted file mode 100644 (file)
index 6a039e0..0000000
+++ /dev/null
@@ -1,646 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "btree_gc.h"
-#include "btree_node_scan.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "lru.h"
-#include "logged_ops.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...)    #_fn,
-       BCH_RECOVERY_PASSES()
-#undef x
-       NULL
-};
-
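-/*
- * In-memory pass numbers are positional (run order) and may be renumbered;
- * these tables translate to and from the stable identifiers stored in the
- * superblock:
- */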
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...)  [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static const u8 passes_from_stable_map[] = {
-#define x(n, id, ...)  [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
-{
-       return passes_to_stable_map[pass];
-}
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
-       u64 ret = 0;
-       for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
-               if (v & BIT_ULL(i))
-                       ret |= BIT_ULL(passes_to_stable_map[i]);
-       return ret;
-}
-
-static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
-{
-       return pass < ARRAY_SIZE(passes_from_stable_map)
-               ? passes_from_stable_map[pass]
-               : 0;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
-       u64 ret = 0;
-       for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
-               if (v & BIT_ULL(i))
-                       ret |= BIT_ULL(passes_from_stable_map[i]);
-       return ret;
-}
-
-static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                           enum bch_validate_flags flags, struct printbuf *err)
-{
-       return 0;
-}
-
-static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
-                                           struct bch_sb *sb,
-                                           struct bch_sb_field *f)
-{
-       struct bch_sb_field_recovery_passes *r =
-               field_to_type(f, recovery_passes);
-       unsigned nr = recovery_passes_nr_entries(r);
-
-       if (out->nr_tabstops < 1)
-               printbuf_tabstop_push(out, 32);
-       if (out->nr_tabstops < 2)
-               printbuf_tabstop_push(out, 16);
-
-       prt_printf(out, "Pass\tLast run\tLast runtime\n");
-
-       for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
-               if (!i->last_run)
-                       continue;
-
-               unsigned idx = i - r->start;
-
-               prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
-
-               bch2_prt_datetime(out, le64_to_cpu(i->last_run));
-               prt_tab(out);
-
-               bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
-
-               if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
-                       prt_str(out, " (no ratelimit)");
-
-               prt_newline(out);
-       }
-}
-
-static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
-                                                              enum bch_recovery_pass pass)
-{
-       enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
-
-       lockdep_assert_held(&c->sb_lock);
-
-       struct bch_sb_field_recovery_passes *r =
-               bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
-       if (stable >= recovery_passes_nr_entries(r)) {
-               unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
-
-               r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
-               if (!r) {
-                       bch_err(c, "error creating recovery_passes sb section");
-                       return NULL;
-               }
-       }
-
-       return r->start + stable;
-}
-
-static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
-                                          enum bch_recovery_pass pass,
-                                          s64 start_time)
-{
-       guard(mutex)(&c->sb_lock);
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-       __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
-                        ext->recovery_passes_required);
-
-       struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
-       if (e) {
-               s64 end_time    = ktime_get_real_seconds();
-               e->last_run     = cpu_to_le64(end_time);
-               e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
-               SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
-       }
-
-       bch2_write_super(c);
-}
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
-                                        enum bch_recovery_pass pass)
-{
-       guard(mutex)(&c->sb_lock);
-
-       struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
-       if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
-               SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
-               bch2_write_super(c);
-       }
-}
-
-static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
-{
-       enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
-       bool ret = false;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       struct bch_sb_field_recovery_passes *r =
-               bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
-       if (stable < recovery_passes_nr_entries(r)) {
-               struct recovery_pass_entry *i = r->start + stable;
-
-               /*
-                * Ratelimit if the last runtime was more than 1% of the time
-                * since we last ran
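-                * (e.g. a pass whose last run took 10 seconds stays ratelimited
-                * until roughly 1000 seconds after that run)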
-                */
-               ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
-                       ktime_get_real_seconds() - le64_to_cpu(i->last_run);
-
-               if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
-                       ret = false;
-       }
-
-       return ret;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
-       .validate       = bch2_sb_recovery_passes_validate,
-       .to_text        = bch2_sb_recovery_passes_to_text
-};
-
-/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
-static int bch2_recovery_pass_empty(struct bch_fs *c)
-{
-       return 0;
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-
-       /*
-        * After we go RW, the journal keys buffer can't be modified (except for
-        * setting journal_key->overwritten), as it will be accessed by multiple
-        * threads:
-        */
-       move_gap(keys, keys->nr);
-
-       set_bit(BCH_FS_may_go_rw, &c->flags);
-
-       if (go_rw_in_recovery(c)) {
-               if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
-                       bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
-                       bch2_reconstruct_alloc(c);
-               }
-
-               return bch2_fs_read_write_early(c);
-       }
-       return 0;
-}
-
-/*
- * Make sure root inode is readable while we're still in recovery and can rewind
- * for repair:
- */
-static int bch2_lookup_root_inode(struct bch_fs *c)
-{
-       subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
-       struct bch_inode_unpacked inode_u;
-       struct bch_subvolume subvol;
-
-       return bch2_trans_do(c,
-               bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
-               bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
-}
-
-struct recovery_pass_fn {
-       int             (*fn)(struct bch_fs *);
-       unsigned        when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when)     { .fn = bch2_##_fn, .when = _when },
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static u64 bch2_recovery_passes_match(unsigned flags)
-{
-       u64 ret = 0;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
-               if (recovery_pass_fns[i].when & flags)
-                       ret |= BIT_ULL(i);
-       return ret;
-}
-
-u64 bch2_fsck_recovery_passes(void)
-{
-       return bch2_recovery_passes_match(PASS_FSCK);
-}
-
-static void bch2_run_async_recovery_passes(struct bch_fs *c)
-{
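-       /*
-        * run_lock is taken here and released by the work fn, or by the failure
-        * paths below; if it's already held, recovery passes are already
-        * running:
-        */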
-       if (down_trylock(&c->recovery.run_lock))
-               return;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
-               goto unlock;
-
-       if (queue_work(system_long_wq, &c->recovery.work))
-               return;
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-unlock:
-       up(&c->recovery.run_lock);
-}
-
-static bool recovery_pass_needs_set(struct bch_fs *c,
-                                   enum bch_recovery_pass pass,
-                                   enum bch_run_recovery_pass_flags *flags)
-{
-       struct bch_fs_recovery *r = &c->recovery;
-
-       /*
-        * Never run scan_for_btree_nodes persistently: check_topology will run
-        * it if required
-        */
-       if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
-               *flags |= RUN_RECOVERY_PASS_nopersistent;
-
-       if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
-           !bch2_recovery_pass_want_ratelimit(c, pass))
-               *flags &= ~RUN_RECOVERY_PASS_ratelimit;
-
-       /*
-        * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
-        * anything if the pass has already run: nopersistent means we need a
-        * prior pass to run before we can continue repairing; we don't expect
-        * that pass to fix the damage we already encountered.
-        *
-        * Otherwise, we run run_explicit_recovery_pass when we find damage, so
-        * it should run again even if it's already run:
-        */
-       bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
-       bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
-       bool rewind = in_recovery &&
-               r->curr_pass > pass &&
-               !(r->passes_complete & BIT_ULL(pass));
-
-       if (persistent
-           ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
-           : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
-               return true;
-
-       if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
-           (r->passes_ratelimiting & BIT_ULL(pass)))
-               return true;
-
-       if (rewind)
-               return true;
-
-       return false;
-}
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
-                                     struct printbuf *out,
-                                     enum bch_recovery_pass pass,
-                                     enum bch_run_recovery_pass_flags flags)
-{
-       struct bch_fs_recovery *r = &c->recovery;
-       int ret = 0;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       bch2_printbuf_make_room(out, 1024);
-       out->atomic++;
-
-       unsigned long lockflags;
-       spin_lock_irqsave(&r->lock, lockflags);
-
-       if (!recovery_pass_needs_set(c, pass, &flags))
-               goto out;
-
-       bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
-       bool rewind = in_recovery &&
-               r->curr_pass > pass &&
-               !(r->passes_complete & BIT_ULL(pass));
-       bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
-
-       if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
-               struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-               __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
-       }
-
-       if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
-           (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
-               prt_printf(out, "need recovery pass %s (%u), but already rw\n",
-                          bch2_recovery_passes[pass], pass);
-               ret = bch_err_throw(c, cannot_rewind_recovery);
-               goto out;
-       }
-
-       if (ratelimit)
-               r->passes_ratelimiting |= BIT_ULL(pass);
-       else
-               r->passes_ratelimiting &= ~BIT_ULL(pass);
-
-       if (in_recovery && !ratelimit) {
-               prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
-                          bch2_recovery_passes[pass], pass,
-                          bch2_recovery_passes[r->curr_pass], r->curr_pass,
-                          rewind ? " - rewinding" : "");
-
-               r->passes_to_run |= BIT_ULL(pass);
-
-               if (rewind) {
-                       r->next_pass = pass;
-                       r->passes_complete &= (1ULL << pass) - 1;
-                       ret = bch_err_throw(c, restart_recovery);
-               }
-       } else {
-               prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
-                          bch2_recovery_passes[pass], pass,
-                          ratelimit ? " - ratelimiting" : "");
-
-               struct recovery_pass_fn *p = recovery_pass_fns + pass;
-               if (p->when & PASS_ONLINE)
-                       bch2_run_async_recovery_passes(c);
-       }
-out:
-       spin_unlock_irqrestore(&r->lock, lockflags);
-       --out->atomic;
-       return ret;
-}
-
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
-                                   struct printbuf *out,
-                                   enum bch_recovery_pass pass,
-                                   enum bch_run_recovery_pass_flags flags)
-{
-       int ret = 0;
-
-       if (recovery_pass_needs_set(c, pass, &flags)) {
-               guard(mutex)(&c->sb_lock);
-               ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
-               bch2_write_super(c);
-       }
-
-       return ret;
-}
-
-/*
- * Returns 0 if @pass has run recently, otherwise one of
- * -BCH_ERR_restart_recovery
- * -BCH_ERR_recovery_pass_will_run
- */
-int bch2_require_recovery_pass(struct bch_fs *c,
-                              struct printbuf *out,
-                              enum bch_recovery_pass pass)
-{
-       if (test_bit(BCH_FS_in_recovery, &c->flags) &&
-           c->recovery.passes_complete & BIT_ULL(pass))
-               return 0;
-
-       guard(mutex)(&c->sb_lock);
-
-       if (bch2_recovery_pass_want_ratelimit(c, pass))
-               return 0;
-
-       enum bch_run_recovery_pass_flags flags = 0;
-       int ret = 0;
-
-       if (recovery_pass_needs_set(c, pass, &flags)) {
-               ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
-               bch2_write_super(c);
-       }
-
-       return ret ?: bch_err_throw(c, recovery_pass_will_run);
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
-       enum bch_run_recovery_pass_flags flags = 0;
-
-       if (!recovery_pass_needs_set(c, pass, &flags))
-               return 0;
-
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       mutex_lock(&c->sb_lock);
-       int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
-                                               RUN_RECOVERY_PASS_nopersistent);
-       mutex_unlock(&c->sb_lock);
-
-       bch2_print_str(c, KERN_NOTICE, buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
-       struct bch_fs_recovery *r = &c->recovery;
-       struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
-       if (!(p->when & PASS_SILENT))
-               bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
-                          bch2_recovery_passes[pass]);
-
-       s64 start_time = ktime_get_real_seconds();
-       int ret = p->fn(c);
-
-       r->passes_to_run &= ~BIT_ULL(pass);
-
-       if (ret) {
-               r->passes_failing |= BIT_ULL(pass);
-               return ret;
-       }
-
-       r->passes_failing = 0;
-
-       if (!test_bit(BCH_FS_error, &c->flags))
-               bch2_sb_recovery_pass_complete(c, pass, start_time);
-
-       if (!(p->when & PASS_SILENT))
-               bch2_print(c, KERN_CONT " done\n");
-
-       return 0;
-}
-
-static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
-                                     bool online)
-{
-       struct bch_fs_recovery *r = &c->recovery;
-       int ret = 0;
-
-       spin_lock_irq(&r->lock);
-
-       if (online)
-               orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
-               orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
-
-       /*
-        * A failed recovery pass will be retried after another pass succeeds -
-        * but not this iteration.
-        *
-        * This is because some passes depend on repair done by other passes: we
-        * may want to retry, but we don't want to loop on failing passes.
-        */
-
-       orig_passes_to_run &= ~r->passes_failing;
-
-       r->passes_to_run = orig_passes_to_run;
-
-       while (r->passes_to_run) {
-               unsigned prev_done = r->pass_done;
-               unsigned pass = __ffs64(r->passes_to_run);
-               r->curr_pass = pass;
-               r->next_pass = r->curr_pass + 1;
-               r->passes_to_run &= ~BIT_ULL(pass);
-
-               spin_unlock_irq(&r->lock);
-
-               int ret2 = bch2_run_recovery_pass(c, pass) ?:
-                       bch2_journal_flush(&c->journal);
-
-               spin_lock_irq(&r->lock);
-
-               if (r->next_pass < r->curr_pass) {
-                       /* Rewind: */
-                       r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
-               } else if (!ret2) {
-                       r->pass_done = max(r->pass_done, pass);
-                       r->passes_complete |= BIT_ULL(pass);
-               } else {
-                       ret = ret2;
-               }
-
-               if (ret && !online)
-                       break;
-
-               if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
-                   r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
-                       bch2_copygc_wakeup(c);
-                       bch2_rebalance_wakeup(c);
-               }
-       }
-
-       clear_bit(BCH_FS_in_recovery, &c->flags);
-       spin_unlock_irq(&r->lock);
-
-       return ret;
-}
-
-static void bch2_async_recovery_passes_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
-       struct bch_fs_recovery *r = &c->recovery;
-
-       __bch2_run_recovery_passes(c,
-               c->sb.recovery_passes_required & ~r->passes_ratelimiting,
-               true);
-
-       up(&r->run_lock);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
-{
-       return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
-}
-
-int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
-{
-       u64 passes =
-               bch2_recovery_passes_match(PASS_ALWAYS) |
-               (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
-               (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
-               c->opts.recovery_passes |
-               c->sb.recovery_passes_required;
-
-       if (c->opts.recovery_pass_last)
-               passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
-
-       /*
-        * We can't allow set_may_go_rw to be excluded; that would cause us to
-        * use the journal replay keys for updates where it's not expected.
-        */
-       c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
-       passes &= ~c->opts.recovery_passes_exclude;
-
-       passes &= ~(BIT_ULL(from) - 1);
-
-       down(&c->recovery.run_lock);
-       int ret = __bch2_run_recovery_passes(c, passes, false);
-       up(&c->recovery.run_lock);
-
-       return ret;
-}
-
-static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
-{
-       prt_printf(out, "%s:\t", msg);
-       prt_bitflags(out, bch2_recovery_passes, passes);
-       prt_newline(out);
-}
-
-void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct bch_fs_recovery *r = &c->recovery;
-
-       printbuf_tabstop_push(out, 32);
-       prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
-       prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
-                  bch2_recovery_passes_match(PASS_ONLINE));
-       prt_passes(out, "Complete passes", r->passes_complete);
-       prt_passes(out, "Failing passes", r->passes_failing);
-
-       if (r->curr_pass) {
-               prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
-               prt_passes(out, "Current passes", r->passes_to_run);
-       }
-}
-
-void bch2_fs_recovery_passes_init(struct bch_fs *c)
-{
-       spin_lock_init(&c->recovery.lock);
-       sema_init(&c->recovery.run_lock, 1);
-
-       INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
-}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
deleted file mode 100644 (file)
index 2117f0c..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _BCACHEFS_RECOVERY_PASSES_H
-#define _BCACHEFS_RECOVERY_PASSES_H
-
-extern const char * const bch2_recovery_passes[];
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
-
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-u64 bch2_fsck_recovery_passes(void);
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
-
-enum bch_run_recovery_pass_flags {
-       RUN_RECOVERY_PASS_nopersistent  = BIT(0),
-       RUN_RECOVERY_PASS_ratelimit     = BIT(1),
-};
-
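-/* Does recovery itself need to write - journal replay, repair, etc.? */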
-static inline bool go_rw_in_recovery(struct bch_fs *c)
-{
-       return (c->journal_keys.nr ||
-               !c->opts.read_only ||
-               !c->sb.clean ||
-               c->opts.recovery_passes ||
-               (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-
-int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
-                                     enum bch_recovery_pass,
-                                     enum bch_run_recovery_pass_flags);
-int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
-                                   enum bch_recovery_pass,
-                                   enum bch_run_recovery_pass_flags);
-
-int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
-                              enum bch_recovery_pass);
-
-int bch2_run_online_recovery_passes(struct bch_fs *, u64);
-int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
-
-void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_recovery_passes_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
deleted file mode 100644 (file)
index b63c205..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-
-#define PASS_SILENT            BIT(0)
-#define PASS_FSCK              BIT(1)
-#define PASS_UNCLEAN           BIT(2)
-#define PASS_ALWAYS            BIT(3)
-#define PASS_ONLINE            BIT(4)
-#define PASS_ALLOC             BIT(5)
-#define PASS_FSCK_ALLOC                (PASS_FSCK|PASS_ALLOC)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG                BIT(1)
-#else
-#define PASS_FSCK_DEBUG                0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
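- *
- * (the identifiers below are not in run order: passes added later keep their
- * newly allocated identifiers wherever they slot into the sequence)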
- */
-#define BCH_RECOVERY_PASSES()                                                          \
-       x(recovery_pass_empty,                  41, PASS_SILENT)                        \
-       x(scan_for_btree_nodes,                 37, 0)                                  \
-       x(check_topology,                        4, 0)                                  \
-       x(accounting_read,                      39, PASS_ALWAYS)                        \
-       x(alloc_read,                            0, PASS_ALWAYS)                        \
-       x(stripes_read,                          1, 0)                                  \
-       x(initialize_subvolumes,                 2, 0)                                  \
-       x(snapshots_read,                        3, PASS_ALWAYS)                        \
-       x(check_allocations,                     5, PASS_FSCK_ALLOC)                    \
-       x(trans_mark_dev_sbs,                    6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
-       x(fs_journal_alloc,                      7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
-       x(set_may_go_rw,                         8, PASS_ALWAYS|PASS_SILENT)            \
-       x(journal_replay,                        9, PASS_ALWAYS)                        \
-       x(check_alloc_info,                     10, PASS_ONLINE|PASS_FSCK_ALLOC)        \
-       x(check_lrus,                           11, PASS_ONLINE|PASS_FSCK_ALLOC)        \
-       x(check_btree_backpointers,             12, PASS_ONLINE|PASS_FSCK_ALLOC)        \
-       x(check_backpointers_to_extents,        13, PASS_ONLINE|PASS_FSCK_DEBUG)        \
-       x(check_extents_to_backpointers,        14, PASS_ONLINE|PASS_FSCK_ALLOC)        \
-       x(check_alloc_to_lru_refs,              15, PASS_ONLINE|PASS_FSCK_ALLOC)        \
-       x(fs_freespace_init,                    16, PASS_ALWAYS|PASS_SILENT)            \
-       x(bucket_gens_init,                     17, 0)                                  \
-       x(reconstruct_snapshots,                38, 0)                                  \
-       x(check_snapshot_trees,                 18, PASS_ONLINE|PASS_FSCK)              \
-       x(check_snapshots,                      19, PASS_ONLINE|PASS_FSCK)              \
-       x(check_subvols,                        20, PASS_ONLINE|PASS_FSCK)              \
-       x(check_subvol_children,                35, PASS_ONLINE|PASS_FSCK)              \
-       x(delete_dead_snapshots,                21, PASS_ONLINE|PASS_FSCK)              \
-       x(fs_upgrade_for_subvolumes,            22, 0)                                  \
-       x(check_inodes,                         24, PASS_FSCK)                          \
-       x(check_extents,                        25, PASS_FSCK)                          \
-       x(check_indirect_extents,               26, PASS_ONLINE|PASS_FSCK)              \
-       x(check_dirents,                        27, PASS_FSCK)                          \
-       x(check_xattrs,                         28, PASS_FSCK)                          \
-       x(check_root,                           29, PASS_ONLINE|PASS_FSCK)              \
-       x(check_unreachable_inodes,             40, PASS_FSCK)                          \
-       x(check_subvolume_structure,            36, PASS_ONLINE|PASS_FSCK)              \
-       x(check_directory_structure,            30, PASS_ONLINE|PASS_FSCK)              \
-       x(check_nlinks,                         31, PASS_FSCK)                          \
-       x(check_rebalance_work,                 43, PASS_ONLINE|PASS_FSCK)              \
-       x(resume_logged_ops,                    23, PASS_ALWAYS)                        \
-       x(delete_dead_inodes,                   32, PASS_ALWAYS)                        \
-       x(fix_reflink_p,                        33, 0)                                  \
-       x(set_fs_needs_rebalance,               34, 0)                                  \
-       x(lookup_root_inode,                    42, PASS_ALWAYS|PASS_SILENT)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
-       BCH_RECOVERY_PASSES()
-#undef x
-       BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
-struct recovery_pass_entry {
-       __le64                  last_run;
-       __le32                  last_runtime;
-       __le32                  flags;
-};
-
-LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT,   struct recovery_pass_entry, flags, 0, 1)
-
-struct bch_sb_field_recovery_passes {
-       struct bch_sb_field     field;
-       struct recovery_pass_entry start[];
-};
-
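-/* Number of entries currently in this variable-length superblock section: */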
-static inline unsigned
-recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
-{
-       return r
-               ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
-                  sizeof(struct recovery_pass_entry))
-               : 0;
-}
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
deleted file mode 100644 (file)
index aa95269..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-
-struct bch_fs_recovery {
-       /*
-        * Two different uses:
-        * "Has this fsck pass?" - i.e. should this type of error be an
-        * emergency read-only
-        * And, in certain situations fsck will rewind to an earlier pass: used
-        * for signaling to the toplevel code which pass we want to run now.
-        */
-       enum bch_recovery_pass  curr_pass;
-       enum bch_recovery_pass  next_pass;
-       /* like curr_pass, but never rewinds */
-       enum bch_recovery_pass  pass_done;
-       u64                     passes_to_run;
-       /* bitmask of recovery passes that we actually ran */
-       u64                     passes_complete;
-       u64                     passes_failing;
-       u64                     passes_ratelimiting;
-       spinlock_t              lock;
-       struct semaphore        run_lock;
-       struct work_struct      work;
-};
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
deleted file mode 100644 (file)
index 92b90cf..0000000
+++ /dev/null
@@ -1,865 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "super-io.h"
-
-#include <linux/sched/signal.h>
-
-static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_reflink_v:
-       case KEY_TYPE_indirect_inline_data:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline unsigned bkey_type_to_indirect(const struct bkey *k)
-{
-       switch (k->type) {
-       case KEY_TYPE_extent:
-               return KEY_TYPE_reflink_v;
-       case KEY_TYPE_inline_data:
-               return KEY_TYPE_indirect_inline_data;
-       default:
-               return 0;
-       }
-}
-
-/* reflink pointers */
-
-int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
-                        c, reflink_p_front_pad_bad,
-                        "idx < front_pad (%llu < %u)",
-                        REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
-fsck_err:
-       return ret;
-}
-
-void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
-       prt_printf(out, "idx %llu front_pad %u back_pad %u",
-              REFLINK_P_IDX(p.v),
-              le32_to_cpu(p.v->front_pad),
-              le32_to_cpu(p.v->back_pad));
-
-       if (REFLINK_P_ERROR(p.v))
-               prt_str(out, " error");
-}
-
-bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
-       struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
-       struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
-
-       /*
-        * Disabled for now: the triggers code needs to be reworked before
-        * merging of reflink pointers can work:
-        */
-       return false;
-
-       if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
-               return false;
-
-       if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
-               return false;
-
-       bch2_key_resize(l.k, l.k->size + r.k->size);
-       return true;
-}
-
-/* indirect extents */
-
-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
-                        c, reflink_v_pos_bad,
-                        "indirect extent above maximum position 0:%llu",
-                        REFLINK_P_IDX_MAX);
-
-       ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
-       return ret;
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
-       prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
-       struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
-       struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
-       return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
-
-/* indirect inline data */
-
-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
-                                      struct bkey_validate_context from)
-{
-       return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
-                                      struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
-       unsigned datalen = bkey_inline_data_bytes(k.k);
-
-       prt_printf(out, "refcount %llu datalen %u: %*phN",
-              le64_to_cpu(d.v->refcount), datalen,
-              min(datalen, 32U), d.v->data);
-}
-
-/* lookup */
-
-static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
-                                           bool should_commit)
-{
-       struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
-       int ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               return ret;
-
-       SET_REFLINK_P_ERROR(&new->v, false);
-       ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
-       if (ret)
-               return ret;
-
-       if (!should_commit)
-               return 0;
-
-       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-               -BCH_ERR_transaction_restart_nested;
-}
-
-static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
-                                             struct bkey_s_c_reflink_p p,
-                                             u64 missing_start, u64 missing_end,
-                                             bool should_commit)
-{
-       if (REFLINK_P_ERROR(p.v))
-               return 0;
-
-       struct bch_fs *c = trans->c;
-       u64 live_start  = REFLINK_P_IDX(p.v);
-       u64 live_end    = REFLINK_P_IDX(p.v) + p.k->size;
-       u64 refd_start  = live_start    - le32_to_cpu(p.v->front_pad);
-       u64 refd_end    = live_end      + le32_to_cpu(p.v->back_pad);
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-
-       BUG_ON(missing_start    < refd_start);
-       BUG_ON(missing_end      > refd_end);
-
-       struct bpos missing_pos = bkey_start_pos(p.k);
-       missing_pos.offset += missing_start - live_start;
-
-       prt_printf(&buf, "pointer to missing indirect extent in ");
-       ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
-       if (ret)
-               goto err;
-
-       prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
-       bch2_bkey_val_to_text(&buf, c, p.s_c);
-
-       prt_printf(&buf, "\nmissing reflink btree range %llu-%llu",
-                  missing_start, missing_end);
-
-       if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
-               struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto err;
-
-               /*
-                * Is the missing range not actually needed?
-                *
-                * p.v->idx refers to the data that we actually want, but if the
-                * indirect extent we point to was bigger, front_pad and back_pad
-                * indicate the range we took a reference on.
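-                *
-                *   refd_start      live_start        live_end      refd_end
-                *   |<- front_pad ->|<- data we want ->|<- back_pad ->|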
-                */
-
-               if (missing_end <= live_start) {
-                       new->v.front_pad = cpu_to_le32(live_start - missing_end);
-               } else if (missing_start >= live_end) {
-                       new->v.back_pad = cpu_to_le32(missing_start - live_end);
-               } else {
-                       struct bpos new_start   = bkey_start_pos(&new->k);
-                       struct bpos new_end     = new->k.p;
-
-                       if (missing_start > live_start)
-                               new_start.offset += missing_start - live_start;
-                       if (missing_end < live_end)
-                               new_end.offset -= live_end - missing_end;
-
-                       bch2_cut_front(new_start, &new->k_i);
-                       bch2_cut_back(new_end, &new->k_i);
-
-                       SET_REFLINK_P_ERROR(&new->v, true);
-               }
-
-               ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
-               if (ret)
-                       goto err;
-
-               if (should_commit)
-                       ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-                               -BCH_ERR_transaction_restart_nested;
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * This is used from the read path, which doesn't expect to have to do a
- * transaction commit, and from triggers, which should not be doing a commit:
- */
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           s64 *offset_into_extent,
-                                           struct bkey_s_c_reflink_p p,
-                                           bool should_commit,
-                                           unsigned iter_flags)
-{
-       BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
-       BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
-
-       u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
-
-       struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
-                                      POS(0, reflink_offset), iter_flags);
-       if (bkey_err(k))
-               return k;
-
-       if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
-               u64 missing_end = min(k.k->p.offset,
-                                     REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
-               BUG_ON(reflink_offset == missing_end);
-
-               int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
-                                                            missing_end, should_commit);
-               if (ret) {
-                       bch2_trans_iter_exit(trans, iter);
-                       return bkey_s_c_err(ret);
-               }
-       } else if (unlikely(REFLINK_P_ERROR(p.v))) {
-               int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
-               if (ret) {
-                       bch2_trans_iter_exit(trans, iter);
-                       return bkey_s_c_err(ret);
-               }
-       }
-
-       *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
-       return k;
-}
-
-/* reflink pointer trigger */
-
-static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
-                       struct bkey_s_c_reflink_p p, u64 *idx,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-
-       s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
-                                                       BTREE_ITER_intent|
-                                                       BTREE_ITER_with_updates);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (!bkey_refcount_c(k)) {
-               if (!(flags & BTREE_TRIGGER_overwrite))
-                       ret = bch_err_throw(c, missing_indirect_extent);
-               goto next;
-       }
-
-       struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               goto err;
-
-       __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
-       if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
-               bch2_bkey_val_to_text(&buf, c, p.s_c);
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, k);
-               log_fsck_err(trans, reflink_refcount_underflow,
-                            "indirect extent refcount underflow while marking\n%s",
-                          buf.buf);
-               goto next;
-       }
-
-       if (flags & BTREE_TRIGGER_insert) {
-               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-               u64 pad;
-
-               pad = max_t(s64, le32_to_cpu(v->front_pad),
-                           REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
-               BUG_ON(pad > U32_MAX);
-               v->front_pad = cpu_to_le32(pad);
-
-               pad = max_t(s64, le32_to_cpu(v->back_pad),
-                           new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
-               BUG_ON(pad > U32_MAX);
-               v->back_pad = cpu_to_le32(pad);
-       }
-
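-       /* An insert takes a reference on the indirect extent; an overwrite drops one: */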
-       le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
-
-       bch2_btree_iter_set_pos_to_extent_start(&iter);
-       ret = bch2_trans_update(trans, &iter, new, 0);
-       if (ret)
-               goto err;
-next:
-       *idx = k.k->p.offset;
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
-                               struct bkey_s_c_reflink_p p, u64 *idx,
-                               enum btree_iter_update_trigger_flags flags,
-                               size_t r_idx)
-{
-       struct bch_fs *c = trans->c;
-       struct reflink_gc *r;
-       int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
-       u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
-       s64 ret = 0;
-       struct printbuf buf = PRINTBUF;
-
-       if (r_idx >= c->reflink_gc_nr)
-               goto not_found;
-
-       r = genradix_ptr(&c->reflink_gc_table, r_idx);
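-       /* reflink_gc entries record their end position; the start is offset - size: */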
-       next_idx = min(next_idx, r->offset - r->size);
-       if (*idx < next_idx)
-               goto not_found;
-
-       BUG_ON((s64) r->refcount + add < 0);
-
-       if (flags & BTREE_TRIGGER_gc)
-               r->refcount += add;
-       *idx = r->offset;
-       return 0;
-not_found:
-       if (flags & BTREE_TRIGGER_check_repair) {
-               ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
-               if (ret)
-                       goto err;
-       }
-
-       *idx = next_idx;
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int __trigger_reflink_p(struct btree_trans *trans,
-               enum btree_id btree_id, unsigned level, struct bkey_s_c k,
-               enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-       int ret = 0;
-
-       u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
-       u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
-
-       if (flags & BTREE_TRIGGER_transactional) {
-               while (idx < end && !ret)
-                       ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
-       }
-
-       if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
-               size_t l = 0, r = c->reflink_gc_nr;
-
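-               /* Binary search for the first entry that ends past idx: */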
-               while (l < r) {
-                       size_t m = l + (r - l) / 2;
-                       struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
-                       if (ref->offset <= idx)
-                               l = m + 1;
-                       else
-                               r = m;
-               }
-
-               while (idx < end && !ret)
-                       ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
-       }
-
-       return ret;
-}
-
-int bch2_trigger_reflink_p(struct btree_trans *trans,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c old,
-                          struct bkey_s new,
-                          enum btree_iter_update_trigger_flags flags)
-{
-       if ((flags & BTREE_TRIGGER_transactional) &&
-           (flags & BTREE_TRIGGER_insert)) {
-               struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
-
-               v->front_pad = v->back_pad = 0;
-       }
-
-       return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
-/* indirect extent trigger */
-
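-/*
- * If an update would leave an indirect extent with a refcount of zero, rewrite
- * it as a deletion instead:
- */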
-static inline void
-check_indirect_extent_deleting(struct bkey_s new,
-                              enum btree_iter_update_trigger_flags *flags)
-{
-       if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
-               new.k->type = KEY_TYPE_deleted;
-               new.k->size = 0;
-               set_bkey_val_u64s(new.k, 0);
-               *flags &= ~BTREE_TRIGGER_insert;
-       }
-}
-
-int bch2_trigger_reflink_v(struct btree_trans *trans,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c old, struct bkey_s new,
-                          enum btree_iter_update_trigger_flags flags)
-{
-       if ((flags & BTREE_TRIGGER_transactional) &&
-           (flags & BTREE_TRIGGER_insert))
-               check_indirect_extent_deleting(new, &flags);
-
-       return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
-}
-
-int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
-                             enum btree_id btree_id, unsigned level,
-                             struct bkey_s_c old, struct bkey_s new,
-                             enum btree_iter_update_trigger_flags flags)
-{
-       check_indirect_extent_deleting(new, &flags);
-
-       return 0;
-}
-
-/* create */
-
-static int bch2_make_extent_indirect(struct btree_trans *trans,
-                                    struct btree_iter *extent_iter,
-                                    struct bkey_i *orig,
-                                    bool reflink_p_may_update_opts_field)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter reflink_iter = {};
-       struct bkey_s_c k;
-       struct bkey_i *r_v;
-       struct bkey_i_reflink_p *r_p;
-       __le64 *refcount;
-       int ret;
-
-       if (orig->k.type == KEY_TYPE_inline_data)
-               bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
-
-       bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
-                            BTREE_ITER_intent);
-       k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       /*
-        * XXX: we're assuming that 56 bits will be enough for the life of the
-        * filesystem: we need to implement wraparound, with a cursor in the
-        * logged ops btree:
-        */
-       if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
-               return -ENOSPC;
-
-       r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
-       ret = PTR_ERR_OR_ZERO(r_v);
-       if (ret)
-               goto err;
-
-       bkey_init(&r_v->k);
-       r_v->k.type     = bkey_type_to_indirect(&orig->k);
-       r_v->k.p        = reflink_iter.pos;
-       bch2_key_resize(&r_v->k, orig->k.size);
-       r_v->k.bversion = orig->k.bversion;
-
-       set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
-
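-       /* An indirect extent is the refcount followed by the original value: */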
-       refcount        = bkey_refcount(bkey_i_to_s(r_v));
-       *refcount       = 0;
-       memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
-
-       ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
-       if (ret)
-               goto err;
-
-       /*
-        * orig is in a bkey_buf which statically allocates 5 64s for the val,
-        * so we know it will be big enough:
-        */
-       orig->k.type = KEY_TYPE_reflink_p;
-       r_p = bkey_i_to_reflink_p(orig);
-       set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
-
-       /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
-#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
-       __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
-#else
-       memset(&r_p->v, 0, sizeof(r_p->v));
-#endif
-
-       SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
-
-       if (reflink_p_may_update_opts_field)
-               SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
-
-       ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
-                               BTREE_UPDATE_internal_snapshot_node);
-err:
-       bch2_trans_iter_exit(trans, &reflink_iter);
-
-       return ret;
-}
-
-static struct bkey_s_c get_next_src(struct btree_trans *trans,
-                                   struct btree_iter *iter, struct bpos end)
-{
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
-               if (bkey_extent_is_unwritten(k))
-                       continue;
-
-               if (bkey_extent_is_data(k.k))
-                       return k;
-       }
-
-       if (bkey_ge(iter->pos, end))
-               bch2_btree_iter_set_pos(trans, iter, end);
-       return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-s64 bch2_remap_range(struct bch_fs *c,
-                    subvol_inum dst_inum, u64 dst_offset,
-                    subvol_inum src_inum, u64 src_offset,
-                    u64 remap_sectors,
-                    u64 new_i_size, s64 *i_sectors_delta,
-                    bool may_change_src_io_path_opts)
-{
-       struct btree_trans *trans;
-       struct btree_iter dst_iter, src_iter;
-       struct bkey_s_c src_k;
-       struct bkey_buf new_dst, new_src;
-       struct bpos dst_start = POS(dst_inum.inum, dst_offset);
-       struct bpos src_start = POS(src_inum.inum, src_offset);
-       struct bpos dst_end = dst_start, src_end = src_start;
-       struct bch_io_opts opts;
-       struct bpos src_want;
-       u64 dst_done = 0;
-       u32 dst_snapshot, src_snapshot;
-       bool reflink_p_may_update_opts_field =
-               !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
-       int ret = 0, ret2 = 0;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
-               return bch_err_throw(c, erofs_no_writes);
-
-       bch2_check_set_feature(c, BCH_FEATURE_reflink);
-
-       dst_end.offset += remap_sectors;
-       src_end.offset += remap_sectors;
-
-       bch2_bkey_buf_init(&new_dst);
-       bch2_bkey_buf_init(&new_src);
-       trans = bch2_trans_get(c);
-
-       ret = bch2_inum_opts_get(trans, src_inum, &opts);
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
-                            BTREE_ITER_intent);
-       bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
-                            BTREE_ITER_intent);
-
-       while ((ret == 0 ||
-               bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
-              bkey_lt(dst_iter.pos, dst_end)) {
-               struct disk_reservation disk_res = { 0 };
-
-               bch2_trans_begin(trans);
-
-               if (fatal_signal_pending(current)) {
-                       ret = -EINTR;
-                       break;
-               }
-
-               ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
-                                                 &src_snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
-
-               ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
-                                                 &dst_snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
-
-               if (dst_inum.inum < src_inum.inum) {
-                       /* Avoid some lock cycle transaction restarts */
-                       ret = bch2_btree_iter_traverse(trans, &dst_iter);
-                       if (ret)
-                               continue;
-               }
-
-               dst_done = dst_iter.pos.offset - dst_start.offset;
-               src_want = POS(src_start.inode, src_start.offset + dst_done);
-               bch2_btree_iter_set_pos(trans, &src_iter, src_want);
-
-               src_k = get_next_src(trans, &src_iter, src_end);
-               ret = bkey_err(src_k);
-               if (ret)
-                       continue;
-
-               if (bkey_lt(src_want, src_iter.pos)) {
-                       ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
-                                       min(dst_end.offset,
-                                           dst_iter.pos.offset +
-                                           src_iter.pos.offset - src_want.offset),
-                                       i_sectors_delta);
-                       continue;
-               }
-
-               if (src_k.k->type != KEY_TYPE_reflink_p) {
-                       bch2_btree_iter_set_pos_to_extent_start(&src_iter);
-
-                       bch2_bkey_buf_reassemble(&new_src, c, src_k);
-                       src_k = bkey_i_to_s_c(new_src.k);
-
-                       ret = bch2_make_extent_indirect(trans, &src_iter,
-                                               new_src.k,
-                                               reflink_p_may_update_opts_field);
-                       if (ret)
-                               continue;
-
-                       BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
-               }
-
-               if (src_k.k->type == KEY_TYPE_reflink_p) {
-                       struct bkey_s_c_reflink_p src_p =
-                               bkey_s_c_to_reflink_p(src_k);
-                       struct bkey_i_reflink_p *dst_p =
-                               bkey_reflink_p_init(new_dst.k);
-
-                       u64 offset = REFLINK_P_IDX(src_p.v) +
-                               (src_want.offset -
-                                bkey_start_offset(src_k.k));
-
-                       SET_REFLINK_P_IDX(&dst_p->v, offset);
-
-                       if (reflink_p_may_update_opts_field &&
-                           may_change_src_io_path_opts &&
-                           REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
-                               SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
-               } else {
-                       BUG();
-               }
-
-               new_dst.k->k.p = dst_iter.pos;
-               bch2_key_resize(&new_dst.k->k,
-                               min(src_k.k->p.offset - src_want.offset,
-                                   dst_end.offset - dst_iter.pos.offset));
-
-               ret =   bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
-                       bch2_extent_update(trans, dst_inum, &dst_iter,
-                                       new_dst.k, &disk_res,
-                                       new_i_size, i_sectors_delta,
-                                       true);
-               bch2_disk_reservation_put(c, &disk_res);
-       }
-       bch2_trans_iter_exit(trans, &dst_iter);
-       bch2_trans_iter_exit(trans, &src_iter);
-
-       BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
-       BUG_ON(bkey_gt(dst_iter.pos, dst_end));
-
-       dst_done = dst_iter.pos.offset - dst_start.offset;
-       new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
-
-       do {
-               struct bch_inode_unpacked inode_u;
-               struct btree_iter inode_iter = {};
-
-               bch2_trans_begin(trans);
-
-               ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
-                                      dst_inum, BTREE_ITER_intent);
-
-               if (!ret2 &&
-                   inode_u.bi_size < new_i_size) {
-                       inode_u.bi_size = new_i_size;
-                       ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
-                               bch2_trans_commit(trans, NULL, NULL,
-                                                 BCH_TRANS_COMMIT_no_enospc);
-               }
-
-               bch2_trans_iter_exit(trans, &inode_iter);
-       } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
-       bch2_trans_put(trans);
-       bch2_bkey_buf_exit(&new_src, c);
-       bch2_bkey_buf_exit(&new_dst, c);
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
-
-       return dst_done ?: ret ?: ret2;
-}
-
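Aside: bch2_remap_range() above was the filesystem-side backend for the VFS remap_file_range path; from userspace the equivalent reflink copy is requested with the generic FICLONERANGE ioctl. A minimal sketch (file names are placeholders; offsets and length must be aligned to the filesystem block size):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
        int src = open("src.dat", O_RDONLY);
        int dst = open("dst.dat", O_WRONLY | O_CREAT, 0644);

        if (src < 0 || dst < 0) {
                perror("open");
                return 1;
        }

        /* Clone the first 1 MiB of src into dst; src_length 0 means "to EOF". */
        struct file_clone_range fcr = {
                .src_fd      = src,
                .src_offset  = 0,
                .src_length  = 1 << 20,
                .dest_offset = 0,
        };

        if (ioctl(dst, FICLONERANGE, &fcr)) {
                perror("FICLONERANGE");
                return 1;
        }
        return 0;
}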
-/* fsck */
-
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
-                                    struct btree_iter *iter,
-                                    struct bkey_s_c k,
-                                    size_t *idx)
-{
-       struct bch_fs *c = trans->c;
-       const __le64 *refcount = bkey_refcount_c(k);
-       struct printbuf buf = PRINTBUF;
-       struct reflink_gc *r;
-       int ret = 0;
-
-       if (!refcount)
-               return 0;
-
-       while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
-              r->offset < k.k->p.offset)
-               ++*idx;
-
-       if (!r ||
-           r->offset != k.k->p.offset ||
-           r->size != k.k->size) {
-               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-               return -EINVAL;
-       }
-
-       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
-                       trans, reflink_v_refcount_wrong,
-                       "reflink key has wrong refcount:\n"
-                       "%s\n"
-                       "should be %llu",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
-                       (unsigned long long) r->refcount)) {
-               struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto out;
-
-               if (!r->refcount)
-                       new->k.type = KEY_TYPE_deleted;
-               else
-                       *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
-               ret = bch2_trans_update(trans, iter, new, 0);
-       }
-out:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_gc_reflink_done(struct bch_fs *c)
-{
-       size_t idx = 0;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_reflink, POS_MIN,
-                               BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
-       c->reflink_gc_nr = 0;
-       return ret;
-}
-
-int bch2_gc_reflink_start(struct bch_fs *c)
-{
-       c->reflink_gc_nr = 0;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
-                                  BTREE_ITER_prefetch, k, ({
-                       const __le64 *refcount = bkey_refcount_c(k);
-
-                       if (!refcount)
-                               continue;
-
-                       struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
-                                                       c->reflink_gc_nr++, GFP_KERNEL);
-                       if (!r) {
-                               ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
-                               break;
-                       }
-
-                       r->offset       = k.k->p.offset;
-                       r->size         = k.k->size;
-                       r->refcount     = 0;
-                       0;
-               })));
-
-       bch_err_fn(c, ret);
-       return ret;
-}
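For orientation before the headers below: the file removed above implements reflink by a refcounted indirection. A KEY_TYPE_reflink_p key owns no data; it names a range of the shared reflink btree, where the reflink_v value stores a refcount ahead of the original extent. A simplified model of that ownership scheme, in illustrative C only (none of these names are bcachefs symbols):

#include <stdint.h>
#include <stdio.h>

struct indirect_extent {
        uint64_t refcount;      /* number of reflink pointers referencing us */
        /* extent payload would follow */
};

struct reflink_ptr {
        uint64_t idx;           /* 56-bit index into the shared reflink btree */
        uint32_t front_pad;     /* pads record the full range originally      */
        uint32_t back_pad;      /* referenced, so later splits don't leak     */
};

static void ref(struct indirect_extent *v)   { v->refcount++; }
static int  unref(struct indirect_extent *v) { return --v->refcount == 0; }

int main(void)
{
        struct indirect_extent v = { .refcount = 1 };   /* created by first clone */

        ref(&v);                        /* a second file clones the range */
        unref(&v);                      /* the first file drops its copy  */
        printf("free now? %s\n", unref(&v) ? "yes" : "no");     /* yes */
        return 0;
}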
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
deleted file mode 100644 (file)
index 1632780..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_H
-#define _BCACHEFS_REFLINK_H
-
-int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
-                           struct bkey_validate_context);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
-                          struct bkey_s_c, struct bkey_s,
-                          enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_p ((struct bkey_ops) {           \
-       .key_validate   = bch2_reflink_p_validate,              \
-       .val_to_text    = bch2_reflink_p_to_text,               \
-       .key_merge      = bch2_reflink_p_merge,                 \
-       .trigger        = bch2_trigger_reflink_p,               \
-       .min_val_size   = 16,                                   \
-})
-
-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
-                           struct bkey_validate_context);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
-                          struct bkey_s_c, struct bkey_s,
-                          enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_v ((struct bkey_ops) {           \
-       .key_validate   = bch2_reflink_v_validate,              \
-       .val_to_text    = bch2_reflink_v_to_text,               \
-       .swab           = bch2_ptr_swab,                        \
-       .trigger        = bch2_trigger_reflink_v,               \
-       .min_val_size   = 8,                                    \
-})
-
-int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
-                                      struct bkey_validate_context);
-void bch2_indirect_inline_data_to_text(struct printbuf *,
-                                      struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_indirect_inline_data(struct btree_trans *,
-                                     enum btree_id, unsigned,
-                                     struct bkey_s_c, struct bkey_s,
-                                     enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) {        \
-       .key_validate   = bch2_indirect_inline_data_validate,   \
-       .val_to_text    = bch2_indirect_inline_data_to_text,    \
-       .trigger        = bch2_trigger_indirect_inline_data,    \
-       .min_val_size   = 8,                                    \
-})
-
-static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_reflink_v:
-               return &bkey_s_c_to_reflink_v(k).v->refcount;
-       case KEY_TYPE_indirect_inline_data:
-               return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
-       default:
-               return NULL;
-       }
-}
-
-static inline __le64 *bkey_refcount(struct bkey_s k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_reflink_v:
-               return &bkey_s_to_reflink_v(k).v->refcount;
-       case KEY_TYPE_indirect_inline_data:
-               return &bkey_s_to_indirect_inline_data(k).v->refcount;
-       default:
-               return NULL;
-       }
-}
-
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
-                                           s64 *, struct bkey_s_c_reflink_p,
-                                           bool, unsigned);
-
-s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
-                    subvol_inum, u64, u64, u64, s64 *,
-                    bool);
-
-int bch2_gc_reflink_done(struct bch_fs *);
-int bch2_gc_reflink_start(struct bch_fs *);
-
-#endif /* _BCACHEFS_REFLINK_H */
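The bkey_refcount_c()/bkey_refcount() helpers above work because both value types begin with their refcount field, so one accessor serves either. A standalone sketch of that shared-prefix trick (types and names are illustrative, not bcachefs symbols):

#include <stdint.h>
#include <stddef.h>

enum vtype { T_REFLINK_V, T_INLINE_DATA, T_OTHER };

struct reflink_v   { uint64_t refcount; /* extent pointers follow */ };
struct inline_data { uint64_t refcount; uint8_t data[];             };

static const uint64_t *refcount_of(enum vtype t, const void *val)
{
        switch (t) {
        case T_REFLINK_V:
                return &((const struct reflink_v *) val)->refcount;
        case T_INLINE_DATA:
                return &((const struct inline_data *) val)->refcount;
        default:
                return NULL;    /* key type carries no refcount */
        }
}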
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
deleted file mode 100644 (file)
index 92995e4..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_FORMAT_H
-#define _BCACHEFS_REFLINK_FORMAT_H
-
-struct bch_reflink_p {
-       struct bch_val          v;
-       __le64                  idx_flags;
-       /*
-        * A reflink pointer might point to an indirect extent which is then
-        * later split (by copygc or rebalance). If we only pointed to part of
-        * the original indirect extent, and then one of the fragments is
-        * outside the range we point to, we'd leak a refcount: so when creating
-        * reflink pointers, we need to store pad values to remember the full
-        * range we were taking a reference on.
-        */
-       __le32                  front_pad;
-       __le32                  back_pad;
-} __packed __aligned(8);
-
-LE64_BITMASK(REFLINK_P_IDX,    struct bch_reflink_p, idx_flags,  0, 56);
-LE64_BITMASK(REFLINK_P_ERROR,  struct bch_reflink_p, idx_flags, 56, 57);
-LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
-                               struct bch_reflink_p, idx_flags, 57, 58);
-
-struct bch_reflink_v {
-       struct bch_val          v;
-       __le64                  refcount;
-       union bch_extent_entry  start[0];
-       __u64                   _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
-       struct bch_val          v;
-       __le64                  refcount;
-       u8                      data[];
-};
-
-#endif /* _BCACHEFS_REFLINK_FORMAT_H */
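The LE64_BITMASK() lines above pack three fields into idx_flags: the 56-bit extent index in bits 0-55, an error flag in bit 56, and the may-update-options flag in bit 57. The macro generates endian-aware accessors; written out by hand, the index pair looks roughly like this (little-endian conversion elided):

#include <stdint.h>

#define IDX_BITS        56
#define IDX_MASK        ((1ULL << IDX_BITS) - 1)

/* Equivalent of REFLINK_P_IDX(): extract bits 0..55. */
static uint64_t reflink_p_idx(uint64_t idx_flags)
{
        return idx_flags & IDX_MASK;
}

/* Equivalent of SET_REFLINK_P_IDX(): replace bits 0..55, keep the flags. */
static uint64_t set_reflink_p_idx(uint64_t idx_flags, uint64_t idx)
{
        return (idx_flags & ~IDX_MASK) | (idx & IDX_MASK);
}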
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
deleted file mode 100644 (file)
index 8383bd7..0000000
+++ /dev/null
@@ -1,918 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "journal.h"
-#include "replicas.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
-                                           struct bch_replicas_cpu *);
-
-/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r,  const void *priv)
-{
-       size_t size = (size_t) priv;
-       return memcmp(l, r, size);
-}
-
-/* Replicas tracking - in memory: */
-
-static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       BUG_ON(!e->nr_devs);
-       BUG_ON(e->nr_required > 1 &&
-              e->nr_required >= e->nr_devs);
-
-       for (unsigned i = 0; i + 1 < e->nr_devs; i++)
-               BUG_ON(e->devs[i] >= e->devs[i + 1]);
-#endif
-}
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
-{
-       bubble_sort(e->devs, e->nr_devs, u8_cmp);
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
-       eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
-                         bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
-}
-
-static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
-                                          struct bch_replicas_entry_v0 *e)
-{
-       bch2_prt_data_type(out, e->data_type);
-
-       prt_printf(out, ": %u [", e->nr_devs);
-       for (unsigned i = 0; i < e->nr_devs; i++)
-               prt_printf(out, i ? " %u" : "%u", e->devs[i]);
-       prt_printf(out, "]");
-}
-
-void bch2_replicas_entry_to_text(struct printbuf *out,
-                                struct bch_replicas_entry_v1 *e)
-{
-       bch2_prt_data_type(out, e->data_type);
-
-       prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
-       for (unsigned i = 0; i < e->nr_devs; i++)
-               prt_printf(out, i ? " %u" : "%u", e->devs[i]);
-       prt_printf(out, "]");
-}
-
-static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
-                                          struct bch_sb *sb,
-                                          struct printbuf *err)
-{
-       if (!r->nr_devs) {
-               prt_printf(err, "no devices in entry ");
-               goto bad;
-       }
-
-       if (r->nr_required > 1 &&
-           r->nr_required >= r->nr_devs) {
-               prt_printf(err, "bad nr_required in entry ");
-               goto bad;
-       }
-
-       for (unsigned i = 0; i < r->nr_devs; i++)
-               if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
-                   !bch2_member_exists(sb, r->devs[i])) {
-                       prt_printf(err, "invalid device %u in entry ", r->devs[i]);
-                       goto bad;
-               }
-
-       return 0;
-bad:
-       bch2_replicas_entry_to_text(err, r);
-       return -BCH_ERR_invalid_replicas_entry;
-}
-
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
-                                struct bch_fs *c,
-                                struct printbuf *err)
-{
-       if (!r->nr_devs) {
-               prt_printf(err, "no devices in entry ");
-               goto bad;
-       }
-
-       if (r->nr_required > 1 &&
-           r->nr_required >= r->nr_devs) {
-               prt_printf(err, "bad nr_required in entry ");
-               goto bad;
-       }
-
-       for (unsigned i = 0; i < r->nr_devs; i++)
-               if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
-                   !bch2_dev_exists(c, r->devs[i])) {
-                       prt_printf(err, "invalid device %u in entry ", r->devs[i]);
-                       goto bad;
-               }
-
-       return 0;
-bad:
-       bch2_replicas_entry_to_text(err, r);
-       return bch_err_throw(c, invalid_replicas_entry);
-}
-
-void bch2_cpu_replicas_to_text(struct printbuf *out,
-                              struct bch_replicas_cpu *r)
-{
-       struct bch_replicas_entry_v1 *e;
-       bool first = true;
-
-       for_each_cpu_replicas_entry(r, e) {
-               if (!first)
-                       prt_printf(out, " ");
-               first = false;
-
-               bch2_replicas_entry_to_text(out, e);
-       }
-}
-
-static void extent_to_replicas(struct bkey_s_c k,
-                              struct bch_replicas_entry_v1 *r)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-
-       r->nr_required  = 1;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (p.ptr.cached)
-                       continue;
-
-               if (!p.has_ec)
-                       replicas_entry_add_dev(r, p.ptr.dev);
-               else
-                       r->nr_required = 0;
-       }
-}
-
-static void stripe_to_replicas(struct bkey_s_c k,
-                              struct bch_replicas_entry_v1 *r)
-{
-       struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-       const struct bch_extent_ptr *ptr;
-
-       r->nr_required  = s.v->nr_blocks - s.v->nr_redundant;
-
-       for (ptr = s.v->ptrs;
-            ptr < s.v->ptrs + s.v->nr_blocks;
-            ptr++)
-               replicas_entry_add_dev(r, ptr->dev);
-}
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
-                          struct bkey_s_c k)
-{
-       e->nr_devs = 0;
-
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-               e->data_type = BCH_DATA_btree;
-               extent_to_replicas(k, e);
-               break;
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               e->data_type = BCH_DATA_user;
-               extent_to_replicas(k, e);
-               break;
-       case KEY_TYPE_stripe:
-               e->data_type = BCH_DATA_parity;
-               stripe_to_replicas(k, e);
-               break;
-       }
-
-       bch2_replicas_entry_sort(e);
-}
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
-                             enum bch_data_type data_type,
-                             struct bch_devs_list devs)
-{
-       BUG_ON(!data_type ||
-              data_type == BCH_DATA_sb ||
-              data_type >= BCH_DATA_NR);
-
-       e->data_type    = data_type;
-       e->nr_devs      = 0;
-       e->nr_required  = 1;
-
-       darray_for_each(devs, i)
-               replicas_entry_add_dev(e, *i);
-
-       bch2_replicas_entry_sort(e);
-}
-
-static struct bch_replicas_cpu
-cpu_replicas_add_entry(struct bch_fs *c,
-                      struct bch_replicas_cpu *old,
-                      struct bch_replicas_entry_v1 *new_entry)
-{
-       struct bch_replicas_cpu new = {
-               .nr             = old->nr + 1,
-               .entry_size     = max_t(unsigned, old->entry_size,
-                                       replicas_entry_bytes(new_entry)),
-       };
-
-       new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
-       if (!new.entries)
-               return new;
-
-       for (unsigned i = 0; i < old->nr; i++)
-               memcpy(cpu_replicas_entry(&new, i),
-                      cpu_replicas_entry(old, i),
-                      old->entry_size);
-
-       memcpy(cpu_replicas_entry(&new, old->nr),
-              new_entry,
-              replicas_entry_bytes(new_entry));
-
-       bch2_cpu_replicas_sort(&new);
-       return new;
-}
-
-static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
-                                      struct bch_replicas_entry_v1 *search)
-{
-       int idx, entry_size = replicas_entry_bytes(search);
-
-       if (unlikely(entry_size > r->entry_size))
-               return -1;
-
-#define entry_cmp(_l, _r)      memcmp(_l, _r, entry_size)
-       idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
-                             entry_cmp, search);
-#undef entry_cmp
-
-       return idx < r->nr ? idx : -1;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *c,
-                           struct bch_replicas_entry_v1 *search)
-{
-       bch2_replicas_entry_sort(search);
-
-       return __replicas_entry_idx(&c->replicas, search);
-}
-
-static bool __replicas_has_entry(struct bch_replicas_cpu *r,
-                                struct bch_replicas_entry_v1 *search)
-{
-       return __replicas_entry_idx(r, search) >= 0;
-}
-
-bool bch2_replicas_marked_locked(struct bch_fs *c,
-                         struct bch_replicas_entry_v1 *search)
-{
-       verify_replicas_entry(search);
-
-       return !search->nr_devs ||
-               (__replicas_has_entry(&c->replicas, search) &&
-                (likely(!c->replicas_gc.entries) ||
-                 __replicas_has_entry(&c->replicas_gc, search)));
-}
-
-bool bch2_replicas_marked(struct bch_fs *c,
-                         struct bch_replicas_entry_v1 *search)
-{
-       percpu_down_read(&c->mark_lock);
-       bool ret = bch2_replicas_marked_locked(c, search);
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
-                               struct bch_replicas_entry_v1 *new_entry)
-{
-       struct bch_replicas_cpu new_r, new_gc;
-       int ret = 0;
-
-       verify_replicas_entry(new_entry);
-
-       memset(&new_r, 0, sizeof(new_r));
-       memset(&new_gc, 0, sizeof(new_gc));
-
-       mutex_lock(&c->sb_lock);
-
-       if (c->replicas_gc.entries &&
-           !__replicas_has_entry(&c->replicas_gc, new_entry)) {
-               new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
-               if (!new_gc.entries) {
-                       ret = bch_err_throw(c, ENOMEM_cpu_replicas);
-                       goto err;
-               }
-       }
-
-       if (!__replicas_has_entry(&c->replicas, new_entry)) {
-               new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
-               if (!new_r.entries) {
-                       ret = bch_err_throw(c, ENOMEM_cpu_replicas);
-                       goto err;
-               }
-
-               ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
-               if (ret)
-                       goto err;
-       }
-
-       if (!new_r.entries &&
-           !new_gc.entries)
-               goto out;
-
-       /* allocations done, now commit: */
-
-       if (new_r.entries)
-               bch2_write_super(c);
-
-       /* don't update in memory replicas until changes are persistent */
-       percpu_down_write(&c->mark_lock);
-       if (new_r.entries)
-               swap(c->replicas, new_r);
-       if (new_gc.entries)
-               swap(new_gc, c->replicas_gc);
-       percpu_up_write(&c->mark_lock);
-out:
-       mutex_unlock(&c->sb_lock);
-
-       kfree(new_r.entries);
-       kfree(new_gc.entries);
-
-       return ret;
-err:
-       bch_err_msg(c, ret, "adding replicas entry");
-       goto out;
-}
-
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
-{
-       return likely(bch2_replicas_marked(c, r))
-               ? 0 : bch2_mark_replicas_slowpath(c, r);
-}
-
-/*
- * Old replicas_gc mechanism: only used for journal replicas entries now, should
- * die at some point:
- */
-
-int bch2_replicas_gc_end(struct bch_fs *c, int ret)
-{
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-       percpu_down_write(&c->mark_lock);
-
-       ret =   ret ?:
-               bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
-       if (!ret)
-               swap(c->replicas, c->replicas_gc);
-
-       kfree(c->replicas_gc.entries);
-       c->replicas_gc.entries = NULL;
-
-       percpu_up_write(&c->mark_lock);
-
-       if (!ret)
-               bch2_write_super(c);
-
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
-       struct bch_replicas_entry_v1 *e;
-       unsigned i = 0;
-
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-       BUG_ON(c->replicas_gc.entries);
-
-       c->replicas_gc.nr               = 0;
-       c->replicas_gc.entry_size       = 0;
-
-       for_each_cpu_replicas_entry(&c->replicas, e) {
-               /* Preserve unknown data types */
-               if (e->data_type >= BCH_DATA_NR ||
-                   !((1 << e->data_type) & typemask)) {
-                       c->replicas_gc.nr++;
-                       c->replicas_gc.entry_size =
-                               max_t(unsigned, c->replicas_gc.entry_size,
-                                     replicas_entry_bytes(e));
-               }
-       }
-
-       c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
-                                        c->replicas_gc.entry_size,
-                                        GFP_KERNEL);
-       if (!c->replicas_gc.entries) {
-               mutex_unlock(&c->sb_lock);
-               bch_err(c, "error allocating c->replicas_gc");
-               return bch_err_throw(c, ENOMEM_replicas_gc);
-       }
-
-       for_each_cpu_replicas_entry(&c->replicas, e)
-               if (e->data_type >= BCH_DATA_NR ||
-                   !((1 << e->data_type) & typemask))
-                       memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
-                              e, c->replicas_gc.entry_size);
-
-       bch2_cpu_replicas_sort(&c->replicas_gc);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
-/*
- * New much simpler mechanism for clearing out unneeded replicas entries - drop
- * replicas entries that have 0 sectors used.
- *
- * However, we don't track sector counts for journal usage, so this doesn't drop
- * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
- * is retained for that.
- */
-int bch2_replicas_gc2(struct bch_fs *c)
-{
-       struct bch_replicas_cpu new = { 0 };
-       unsigned nr;
-       int ret = 0;
-
-       bch2_accounting_mem_gc(c);
-retry:
-       nr              = READ_ONCE(c->replicas.nr);
-       new.entry_size  = READ_ONCE(c->replicas.entry_size);
-       new.entries     = kcalloc(nr, new.entry_size, GFP_KERNEL);
-       if (!new.entries) {
-               bch_err(c, "error allocating c->replicas_gc");
-               return bch_err_throw(c, ENOMEM_replicas_gc);
-       }
-
-       mutex_lock(&c->sb_lock);
-       percpu_down_write(&c->mark_lock);
-
-       if (nr                  != c->replicas.nr ||
-           new.entry_size      != c->replicas.entry_size) {
-               percpu_up_write(&c->mark_lock);
-               mutex_unlock(&c->sb_lock);
-               kfree(new.entries);
-               goto retry;
-       }
-
-       for (unsigned i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry_v1 *e =
-                       cpu_replicas_entry(&c->replicas, i);
-
-               struct disk_accounting_pos k = {
-                       .type = BCH_DISK_ACCOUNTING_replicas,
-               };
-
-               unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
-                             "embedded variable length struct");
-
-               struct bpos p = disk_accounting_pos_to_bpos(&k);
-
-               struct bch_accounting_mem *acc = &c->accounting;
-               bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-                                           accounting_pos_cmp, &p) >= acc->k.nr;
-
-               if (e->data_type == BCH_DATA_journal || !kill)
-                       memcpy(cpu_replicas_entry(&new, new.nr++),
-                              e, new.entry_size);
-       }
-
-       bch2_cpu_replicas_sort(&new);
-
-       ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
-
-       if (!ret)
-               swap(c->replicas, new);
-
-       kfree(new.entries);
-
-       percpu_up_write(&c->mark_lock);
-
-       if (!ret)
-               bch2_write_super(c);
-
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-/* Replicas tracking - superblock: */
-
-static int
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
-                                  struct bch_replicas_cpu *cpu_r)
-{
-       struct bch_replicas_entry_v1 *e, *dst;
-       unsigned nr = 0, entry_size = 0, idx = 0;
-
-       for_each_replicas_entry(sb_r, e) {
-               entry_size = max_t(unsigned, entry_size,
-                                  replicas_entry_bytes(e));
-               nr++;
-       }
-
-       cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
-       if (!cpu_r->entries)
-               return -BCH_ERR_ENOMEM_cpu_replicas;
-
-       cpu_r->nr               = nr;
-       cpu_r->entry_size       = entry_size;
-
-       for_each_replicas_entry(sb_r, e) {
-               dst = cpu_replicas_entry(cpu_r, idx++);
-               memcpy(dst, e, replicas_entry_bytes(e));
-               bch2_replicas_entry_sort(dst);
-       }
-
-       return 0;
-}
-
-static int
-__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
-                                     struct bch_replicas_cpu *cpu_r)
-{
-       struct bch_replicas_entry_v0 *e;
-       unsigned nr = 0, entry_size = 0, idx = 0;
-
-       for_each_replicas_entry(sb_r, e) {
-               entry_size = max_t(unsigned, entry_size,
-                                  replicas_entry_bytes(e));
-               nr++;
-       }
-
-       entry_size += sizeof(struct bch_replicas_entry_v1) -
-               sizeof(struct bch_replicas_entry_v0);
-
-       cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
-       if (!cpu_r->entries)
-               return -BCH_ERR_ENOMEM_cpu_replicas;
-
-       cpu_r->nr               = nr;
-       cpu_r->entry_size       = entry_size;
-
-       for_each_replicas_entry(sb_r, e) {
-               struct bch_replicas_entry_v1 *dst =
-                       cpu_replicas_entry(cpu_r, idx++);
-
-               dst->data_type  = e->data_type;
-               dst->nr_devs    = e->nr_devs;
-               dst->nr_required = 1;
-               memcpy(dst->devs, e->devs, e->nr_devs);
-               bch2_replicas_entry_sort(dst);
-       }
-
-       return 0;
-}
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
-       struct bch_sb_field_replicas *sb_v1;
-       struct bch_sb_field_replicas_v0 *sb_v0;
-       struct bch_replicas_cpu new_r = { 0, 0, NULL };
-       int ret = 0;
-
-       if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
-               ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
-       else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
-               ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
-       if (ret)
-               return ret;
-
-       bch2_cpu_replicas_sort(&new_r);
-
-       percpu_down_write(&c->mark_lock);
-       swap(c->replicas, new_r);
-       percpu_up_write(&c->mark_lock);
-
-       kfree(new_r.entries);
-
-       return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
-                                              struct bch_replicas_cpu *r)
-{
-       struct bch_sb_field_replicas_v0 *sb_r;
-       struct bch_replicas_entry_v0 *dst;
-       struct bch_replicas_entry_v1 *src;
-       size_t bytes;
-
-       bytes = sizeof(struct bch_sb_field_replicas);
-
-       for_each_cpu_replicas_entry(r, src)
-               bytes += replicas_entry_bytes(src) - 1;
-
-       sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
-                       DIV_ROUND_UP(bytes, sizeof(u64)));
-       if (!sb_r)
-               return bch_err_throw(c, ENOSPC_sb_replicas);
-
-       bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
-       sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
-
-       memset(&sb_r->entries, 0,
-              vstruct_end(&sb_r->field) -
-              (void *) &sb_r->entries);
-
-       dst = sb_r->entries;
-       for_each_cpu_replicas_entry(r, src) {
-               dst->data_type  = src->data_type;
-               dst->nr_devs    = src->nr_devs;
-               memcpy(dst->devs, src->devs, src->nr_devs);
-
-               dst = replicas_entry_next(dst);
-
-               BUG_ON((void *) dst > vstruct_end(&sb_r->field));
-       }
-
-       return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
-                                           struct bch_replicas_cpu *r)
-{
-       struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_entry_v1 *dst, *src;
-       bool need_v1 = false;
-       size_t bytes;
-
-       bytes = sizeof(struct bch_sb_field_replicas);
-
-       for_each_cpu_replicas_entry(r, src) {
-               bytes += replicas_entry_bytes(src);
-               if (src->nr_required != 1)
-                       need_v1 = true;
-       }
-
-       if (!need_v1)
-               return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
-
-       sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
-                       DIV_ROUND_UP(bytes, sizeof(u64)));
-       if (!sb_r)
-               return bch_err_throw(c, ENOSPC_sb_replicas);
-
-       bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
-       sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
-
-       memset(&sb_r->entries, 0,
-              vstruct_end(&sb_r->field) -
-              (void *) &sb_r->entries);
-
-       dst = sb_r->entries;
-       for_each_cpu_replicas_entry(r, src) {
-               memcpy(dst, src, replicas_entry_bytes(src));
-
-               dst = replicas_entry_next(dst);
-
-               BUG_ON((void *) dst > vstruct_end(&sb_r->field));
-       }
-
-       return 0;
-}
-
-static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
-                                     struct bch_sb *sb,
-                                     struct printbuf *err)
-{
-       unsigned i;
-
-       sort_r(cpu_r->entries,
-              cpu_r->nr,
-              cpu_r->entry_size,
-              bch2_memcmp, NULL,
-              (void *)(size_t)cpu_r->entry_size);
-
-       for (i = 0; i < cpu_r->nr; i++) {
-               struct bch_replicas_entry_v1 *e =
-                       cpu_replicas_entry(cpu_r, i);
-
-               int ret = bch2_replicas_entry_sb_validate(e, sb, err);
-               if (ret)
-                       return ret;
-
-               if (i + 1 < cpu_r->nr) {
-                       struct bch_replicas_entry_v1 *n =
-                               cpu_replicas_entry(cpu_r, i + 1);
-
-                       BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
-
-                       if (!memcmp(e, n, cpu_r->entry_size)) {
-                               prt_printf(err, "duplicate replicas entry ");
-                               bch2_replicas_entry_to_text(err, e);
-                               return -BCH_ERR_invalid_sb_replicas;
-                       }
-               }
-       }
-
-       return 0;
-}
-
-static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                    enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
-       struct bch_replicas_cpu cpu_r;
-       int ret;
-
-       ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
-       if (ret)
-               return ret;
-
-       ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
-       kfree(cpu_r.entries);
-       return ret;
-}
-
-static void bch2_sb_replicas_to_text(struct printbuf *out,
-                                    struct bch_sb *sb,
-                                    struct bch_sb_field *f)
-{
-       struct bch_sb_field_replicas *r = field_to_type(f, replicas);
-       struct bch_replicas_entry_v1 *e;
-       bool first = true;
-
-       for_each_replicas_entry(r, e) {
-               if (!first)
-                       prt_printf(out, " ");
-               first = false;
-
-               bch2_replicas_entry_to_text(out, e);
-       }
-       prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
-       .validate       = bch2_sb_replicas_validate,
-       .to_text        = bch2_sb_replicas_to_text,
-};
-
-static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                       enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
-       struct bch_replicas_cpu cpu_r;
-       int ret;
-
-       ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
-       if (ret)
-               return ret;
-
-       ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
-       kfree(cpu_r.entries);
-       return ret;
-}
-
-static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
-                                       struct bch_sb *sb,
-                                       struct bch_sb_field *f)
-{
-       struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
-       struct bch_replicas_entry_v0 *e;
-       bool first = true;
-
-       for_each_replicas_entry(sb_r, e) {
-               if (!first)
-                       prt_printf(out, " ");
-               first = false;
-
-               bch2_replicas_entry_v0_to_text(out, e);
-       }
-       prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
-       .validate       = bch2_sb_replicas_v0_validate,
-       .to_text        = bch2_sb_replicas_v0_to_text,
-};
-
-/* Query replicas: */
-
-bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
-                          unsigned flags, bool print)
-{
-       struct bch_replicas_entry_v1 *e;
-       bool ret = true;
-
-       percpu_down_read(&c->mark_lock);
-       for_each_cpu_replicas_entry(&c->replicas, e) {
-               unsigned nr_online = 0, nr_failed = 0, dflags = 0;
-               bool metadata = e->data_type < BCH_DATA_user;
-
-               if (e->data_type == BCH_DATA_cached)
-                       continue;
-
-               scoped_guard(rcu)
-                       for (unsigned i = 0; i < e->nr_devs; i++) {
-                               if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
-                                       nr_failed++;
-                                       continue;
-                               }
-
-                               nr_online += test_bit(e->devs[i], devs.d);
-
-                               struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
-                               nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-                       }
-
-               if (nr_online + nr_failed == e->nr_devs)
-                       continue;
-
-               if (nr_online < e->nr_required)
-                       dflags |= metadata
-                               ? BCH_FORCE_IF_METADATA_LOST
-                               : BCH_FORCE_IF_DATA_LOST;
-
-               if (nr_online < e->nr_devs)
-                       dflags |= metadata
-                               ? BCH_FORCE_IF_METADATA_DEGRADED
-                               : BCH_FORCE_IF_DATA_DEGRADED;
-
-               if (dflags & ~flags) {
-                       if (print) {
-                               struct printbuf buf = PRINTBUF;
-
-                               bch2_replicas_entry_to_text(&buf, e);
-                               bch_err(c, "insufficient devices online (%u) for replicas entry %s",
-                                       nr_online, buf.buf);
-                               printbuf_exit(&buf);
-                       }
-                       ret = false;
-                       break;
-               }
-       }
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
-       struct bch_sb_field_replicas *replicas;
-       struct bch_sb_field_replicas_v0 *replicas_v0;
-       unsigned data_has = 0;
-
-       replicas = bch2_sb_field_get(sb, replicas);
-       replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
-
-       if (replicas) {
-               struct bch_replicas_entry_v1 *r;
-
-               for_each_replicas_entry(replicas, r) {
-                       if (r->data_type >= sizeof(data_has) * 8)
-                               continue;
-
-                       for (unsigned i = 0; i < r->nr_devs; i++)
-                               if (r->devs[i] == dev)
-                                       data_has |= 1 << r->data_type;
-               }
-
-       } else if (replicas_v0) {
-               struct bch_replicas_entry_v0 *r;
-
-               for_each_replicas_entry_v0(replicas_v0, r) {
-                       if (r->data_type >= sizeof(data_has) * 8)
-                               continue;
-
-                       for (unsigned i = 0; i < r->nr_devs; i++)
-                               if (r->devs[i] == dev)
-                                       data_has |= 1 << r->data_type;
-               }
-       }
-
-       return data_has;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
-       mutex_lock(&c->sb_lock);
-       unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-void bch2_fs_replicas_exit(struct bch_fs *c)
-{
-       kfree(c->replicas.entries);
-       kfree(c->replicas_gc.entries);
-}
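A note on the in-memory form used throughout the file above: bch_replicas_cpu keeps its variable-length entries in one flat allocation, each padded to a common entry_size, so the set can be memcmp-sorted and searched. A simplified sketch of that layout, with a plain linear lookup standing in for the eytzinger0_find() the real code uses:

#include <string.h>
#include <stddef.h>

struct table {
        unsigned nr, entry_size;        /* all entries padded to entry_size */
        void *entries;                  /* one flat allocation, nr strides  */
};

static void *table_entry(struct table *t, unsigned i)
{
        return (char *) t->entries + (size_t) t->entry_size * i;
}

static int table_find(struct table *t, const void *search)
{
        for (unsigned i = 0; i < t->nr; i++)
                if (!memcmp(table_entry(t, i), search, t->entry_size))
                        return (int) i;
        return -1;      /* not present */
}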
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
deleted file mode 100644 (file)
index 5aba2c1..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_H
-#define _BCACHEFS_REPLICAS_H
-
-#include "bkey.h"
-#include "eytzinger.h"
-#include "replicas_types.h"
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
-void bch2_replicas_entry_to_text(struct printbuf *,
-                                struct bch_replicas_entry_v1 *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
-                                struct bch_fs *, struct printbuf *);
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-
-static inline struct bch_replicas_entry_v1 *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-       return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *,
-                           struct bch_replicas_entry_v1 *);
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
-                             enum bch_data_type,
-                             struct bch_devs_list);
-
-bool bch2_replicas_marked_locked(struct bch_fs *,
-                         struct bch_replicas_entry_v1 *);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
-int bch2_mark_replicas(struct bch_fs *,
-                      struct bch_replicas_entry_v1 *);
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
-                                             unsigned dev)
-{
-       e->data_type    = BCH_DATA_cached;
-       e->nr_devs      = 1;
-       e->nr_required  = 1;
-       e->devs[0]      = dev;
-}
-
-bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
-                          unsigned, bool);
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-int bch2_replicas_gc2(struct bch_fs *);
-
-#define for_each_cpu_replicas_entry(_r, _i)                            \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
-            _i = (void *) (_i) + (_r)->entry_size)
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-#define replicas_entry_next(_i)                                                \
-       ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
-
-#define for_each_replicas_entry(_r, _i)                                        \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
-            (_i) = replicas_entry_next(_i))
-
-#define for_each_replicas_entry_v0(_r, _i)                             \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
-            (_i) = replicas_entry_next(_i))
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
-
-void bch2_fs_replicas_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_REPLICAS_H */
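The for_each_replicas_entry() macros above walk the unpadded on-disk form, advancing by each record's own replicas_entry_bytes(). Roughly, assuming the v1 layout defined in replicas_format.h below:

#include <stddef.h>
#include <stdint.h>

struct entry_v1 {
        uint8_t data_type;
        uint8_t nr_devs;
        uint8_t nr_required;
        uint8_t devs[];
};

static size_t entry_bytes(const struct entry_v1 *e)
{
        return offsetof(struct entry_v1, devs) + e->nr_devs;
}

/* Walk until the end of the field or a zero data_type terminator. */
static unsigned count_entries(const void *buf, const void *end)
{
        unsigned n = 0;

        for (const struct entry_v1 *e = buf;
             (const void *) e < end && e->data_type;
             e = (const void *) ((const char *) e + entry_bytes(e)))
                n++;
        return n;
}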
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
deleted file mode 100644 (file)
index b7eff90..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_FORMAT_H
-#define _BCACHEFS_REPLICAS_FORMAT_H
-
-struct bch_replicas_entry_v0 {
-       __u8                    data_type;
-       __u8                    nr_devs;
-       __u8                    devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
-       struct bch_sb_field     field;
-       struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
-       __u8                    data_type;
-       __u8                    nr_devs;
-       __u8                    nr_required;
-       __u8                    devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas {
-       struct bch_sb_field     field;
-       struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-#define replicas_entry_bytes(_i)                                       \
-       (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-#define replicas_entry_add_dev(e, d) ({                                        \
-       (e)->nr_devs++;                                                 \
-       (e)->devs[(e)->nr_devs - 1] = (d);                              \
-})
-
-#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
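Worked sizing for the two layouts above: the v0 header is two bytes (no nr_required), the v1 header three, plus one byte per listed device, so a v1 entry naming three devices occupies six bytes. Checked in C11 (a sketch; the kernel structs are __packed, but with single-byte members the offsets come out the same):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct e_v0 { uint8_t data_type, nr_devs;              uint8_t devs[]; };
struct e_v1 { uint8_t data_type, nr_devs, nr_required; uint8_t devs[]; };

static_assert(offsetof(struct e_v0, devs) == 2, "v0 header is 2 bytes");
static_assert(offsetof(struct e_v1, devs) == 3, "v1 header is 3 bytes");
/* replicas_entry_bytes(): header + one byte per device, e.g. a v1
 * entry over three devices is 3 + 3 = 6 bytes. */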
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
deleted file mode 100644 (file)
index fed71c8..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_TYPES_H
-#define _BCACHEFS_REPLICAS_TYPES_H
-
-struct bch_replicas_cpu {
-       unsigned                nr;
-       unsigned                entry_size;
-       struct bch_replicas_entry_v1 *entries;
-};
-
-#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
deleted file mode 100644 (file)
index 59c8770..0000000
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "super-io.h"
-
-/*
- * BCH_SB_FIELD_clean:
- *
- * Btree roots, and a few other things, are recovered from the journal after an
- * unclean shutdown - but after a clean shutdown, to avoid having to read the
- * journal, we can store them in the superblock.
- *
- * bch_sb_field_clean simply contains a list of journal entries, stored exactly
- * as they would be in the journal:
- */
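The clean section's entries are self-sized records walked with vstruct_next(); the overrun check in bch2_sb_clean_validate_late() below guards exactly that advance. A generic sketch of the pattern, with illustrative field widths:

#include <stddef.h>
#include <stdint.h>

struct rec {
        uint16_t u64s;          /* payload length in 8-byte units */
        uint16_t type;
        uint8_t  data[];
};

static const struct rec *rec_next(const struct rec *r)
{
        return (const void *) ((const char *) r +
                               sizeof(*r) + (size_t) r->u64s * 8);
}

static int walk(const void *buf, const void *end)
{
        for (const struct rec *r = buf; (const void *) r < end; r = rec_next(r))
                if ((const void *) rec_next(r) > end)
                        return -1;      /* record overruns the buffer */
        return 0;
}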
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
-                               int write)
-{
-       struct bkey_validate_context from = {
-               .flags          = write,
-               .from           = BKEY_VALIDATE_superblock,
-       };
-       struct jset_entry *entry;
-       int ret;
-
-       for (entry = clean->start;
-            entry < (struct jset_entry *) vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               if (vstruct_end(entry) > vstruct_end(&clean->field)) {
-                       bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
-                               le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
-                               (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
-                       bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
-                       return -BCH_ERR_fsck_repair_unimplemented;
-               }
-
-               ret = bch2_journal_entry_validate(c, NULL, entry,
-                                                 le16_to_cpu(c->disk_sb.sb->version),
-                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
-                                                 from);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static struct bkey_i *btree_root_find(struct bch_fs *c,
-                                     struct bch_sb_field_clean *clean,
-                                     struct jset *j,
-                                     enum btree_id id, unsigned *level)
-{
-       struct bkey_i *k;
-       struct jset_entry *entry, *start, *end;
-
-       if (clean) {
-               start = clean->start;
-               end = vstruct_end(&clean->field);
-       } else {
-               start = j->start;
-               end = vstruct_last(j);
-       }
-
-       for (entry = start; entry < end; entry = vstruct_next(entry))
-               if (entry->type == BCH_JSET_ENTRY_btree_root &&
-                   entry->btree_id == id)
-                       goto found;
-
-       return NULL;
-found:
-       if (!entry->u64s)
-               return ERR_PTR(-EINVAL);
-
-       k = entry->start;
-       *level = entry->level;
-       return k;
-}
-
-int bch2_verify_superblock_clean(struct bch_fs *c,
-                                struct bch_sb_field_clean **cleanp,
-                                struct jset *j)
-{
-       unsigned i;
-       struct bch_sb_field_clean *clean = *cleanp;
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-       int ret = 0;
-
-       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
-                       sb_clean_journal_seq_mismatch,
-                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
-                       le64_to_cpu(clean->journal_seq),
-                       le64_to_cpu(j->seq))) {
-               kfree(clean);
-               *cleanp = NULL;
-               return 0;
-       }
-
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               struct bkey_i *k1, *k2;
-               unsigned l1 = 0, l2 = 0;
-
-               k1 = btree_root_find(c, clean, NULL, i, &l1);
-               k2 = btree_root_find(c, NULL, j, i, &l2);
-
-               if (!k1 && !k2)
-                       continue;
-
-               printbuf_reset(&buf1);
-               printbuf_reset(&buf2);
-
-               if (k1)
-                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
-               else
-                       prt_printf(&buf1, "(none)");
-
-               if (k2)
-                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
-               else
-                       prt_printf(&buf2, "(none)");
-
-               mustfix_fsck_err_on(!k1 || !k2 ||
-                                   IS_ERR(k1) ||
-                                   IS_ERR(k2) ||
-                                   k1->k.u64s != k2->k.u64s ||
-                                   memcmp(k1, k2, bkey_bytes(&k1->k)) ||
-                                   l1 != l2, c,
-                       sb_clean_btree_root_mismatch,
-                       "superblock btree root %u doesn't match journal after clean shutdown\n"
-                       "sb:      l=%u %s\n"
-                       "journal: l=%u %s\n", i,
-                       l1, buf1.buf,
-                       l2, buf2.buf);
-       }
-fsck_err:
-       printbuf_exit(&buf2);
-       printbuf_exit(&buf1);
-       return ret;
-}
-
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *clean, *sb_clean;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
-
-       if (fsck_err_on(!sb_clean, c,
-                       sb_clean_missing,
-                       "superblock marked clean but clean section not present")) {
-               SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-               c->sb.clean = false;
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(-BCH_ERR_invalid_sb_clean);
-       }
-
-       clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
-                       GFP_KERNEL);
-       if (!clean) {
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
-       }
-
-       ret = bch2_sb_clean_validate_late(c, clean, READ);
-       if (ret) {
-               kfree(clean);
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(ret);
-       }
-
-       mutex_unlock(&c->sb_lock);
-
-       return clean;
-fsck_err:
-       mutex_unlock(&c->sb_lock);
-       return ERR_PTR(ret);
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                          struct jset_entry **end,
-                                          u64 journal_seq)
-{
-       {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_key_version;
-               u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-       }
-
-       for (unsigned i = 0; i < 2; i++) {
-               struct jset_entry_clock *clock =
-                       container_of(jset_entry_init(end, sizeof(*clock)),
-                                    struct jset_entry_clock, entry);
-
-               clock->entry.type = BCH_JSET_ENTRY_clock;
-               clock->rw       = i;
-               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
-       }
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
-       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
-               prt_printf(err, "wrong size (got %zu, should be %zu)",
-                      vstruct_bytes(&clean->field), sizeof(*clean));
-               return -BCH_ERR_invalid_sb_clean;
-       }
-
-       for (struct jset_entry *entry = clean->start;
-            entry != vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
-                       prt_str(err, "entry type ");
-                       bch2_prt_jset_entry_type(err, entry->type);
-                       prt_str(err, " overruns end of section");
-                       return -BCH_ERR_invalid_sb_clean;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-       struct jset_entry *entry;
-
-       prt_printf(out, "flags:          %x\n",         le32_to_cpu(clean->flags));
-       prt_printf(out, "journal_seq:    %llu\n",       le64_to_cpu(clean->journal_seq));
-
-       for (entry = clean->start;
-            entry != vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
-                       break;
-
-               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
-                   !entry->u64s)
-                       continue;
-
-               bch2_journal_entry_to_text(out, NULL, entry);
-               prt_newline(out);
-       }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_clean = {
-       .validate       = bch2_sb_clean_validate,
-       .to_text        = bch2_sb_clean_to_text,
-};
-
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
-       int ret;
-
-       /*
-        * Unconditionally write superblock, to verify it hasn't changed before
-        * we go rw:
-        */
-
-       mutex_lock(&c->sb_lock);
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
-       ret = bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *sb_clean;
-       struct jset_entry *entry;
-       unsigned u64s;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       if (BCH_SB_CLEAN(c->disk_sb.sb))
-               goto out;
-
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
-       u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
-       sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
-       if (!sb_clean) {
-               bch_err(c, "error resizing superblock while setting filesystem clean");
-               goto out;
-       }
-
-       sb_clean->flags         = 0;
-       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
-
-       /* Trying to catch outstanding bug: */
-       BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
-       entry = sb_clean->start;
-       bch2_journal_super_entries_add_common(c, &entry, 0);
-       entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
-       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
-       memset(entry, 0,
-              vstruct_end(&sb_clean->field) - (void *) entry);
-
-       /*
-        * this should be in the write path, and we should be validating every
-        * superblock section:
-        */
-       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
-       if (ret) {
-               bch_err(c, "error marking filesystem clean: validate error");
-               goto out;
-       }
-
-       bch2_journal_pos_from_member_info_set(c);
-
-       bch2_write_super(c);
-out:
-       mutex_unlock(&c->sb_lock);
-}
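
For reference, a stand-alone sketch of the entry walk that bch2_sb_clean_validate() performs above. The struct here is a simplified stand-in, not the kernel's jset_entry/vstruct definitions, and the size accounting is deliberately simplified (a payload count in u64s after a fixed 8-byte header), but the loop shape, the overrun check, and the way a zeroed tail (cf. the memset in bch2_fs_mark_clean()) parses as empty entries all match:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a section entry: 8-byte header, then
 * e->u64s payload words, entries packed back to back. */
struct entry {
	uint16_t u64s;          /* payload size, in 64-bit words */
	uint8_t  type;
	uint8_t  pad[5];
	uint64_t data[];
};

static struct entry *entry_next(struct entry *e)
{
	return (struct entry *) &e->data[e->u64s];
}

int main(void)
{
	uint64_t buf[8] = {0};  /* the "section": 8 u64s, zero-filled */
	void *end = &buf[8];

	struct entry *e1 = (struct entry *) buf;
	e1->u64s = 2;
	e1->type = 1;

	struct entry *e2 = entry_next(e1);
	e2->u64s = 1;
	e2->type = 2;

	/* same shape as the validate loop: advance by each entry's own
	 * size, and reject an entry whose end overruns the section */
	for (struct entry *e = e1; (void *) e < end; e = entry_next(e)) {
		if ((void *) entry_next(e) > end) {
			fprintf(stderr, "entry overruns end of section\n");
			return 1;
		}
		printf("entry type %u, %u payload u64s\n", e->type, e->u64s);
	}
	return 0;
}

The zeroed words after e2 parse as type-0 entries with no payload, which is why the kernel code can memset the tail of the clean section and simply skip empty entries when printing.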
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
deleted file mode 100644 (file)
index 71caef2..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_CLEAN_H
-#define _BCACHEFS_SB_CLEAN_H
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
-                                struct jset *);
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
-void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
-
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
deleted file mode 100644 (file)
index 2b4b844..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "super-io.h"
-#include "sb-counters.h"
-
-/* BCH_SB_FIELD_counters */
-
-static const u8 counters_to_stable_map[] = {
-#define x(n, id, ...)  [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-};
-
-const char * const bch2_counter_names[] = {
-#define x(t, n, ...) (#t),
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-       NULL
-};
-
-static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
-{
-       if (!ctrs)
-               return 0;
-
-       return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-}
-
-static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       return 0;
-}
-
-static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
-                             struct bch_sb_field *f)
-{
-       struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
-       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
-       for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
-               unsigned stable = counters_to_stable_map[i];
-               if (stable < nr)
-                       prt_printf(out, "%s \t%llu\n",
-                                  bch2_counter_names[i],
-                                  le64_to_cpu(ctrs->d[stable]));
-       }
-}
-
-int bch2_sb_counters_to_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
-       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
-       for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
-               c->counters_on_mount[i] = 0;
-
-       for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
-               unsigned stable = counters_to_stable_map[i];
-               if (stable < nr) {
-                       u64 v = le64_to_cpu(ctrs->d[stable]);
-                       percpu_u64_set(&c->counters[i], v);
-                       c->counters_on_mount[i] = v;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_sb_counters_from_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
-       struct bch_sb_field_counters *ret;
-       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
-       if (nr < BCH_COUNTER_NR) {
-               ret = bch2_sb_field_resize(&c->disk_sb, counters,
-                                          sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
-               if (ret) {
-                       ctrs = ret;
-                       nr = bch2_sb_counter_nr_entries(ctrs);
-               }
-       }
-
-       for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
-               unsigned stable = counters_to_stable_map[i];
-               if (stable < nr)
-                       ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
-       }
-
-       return 0;
-}
-
-void bch2_fs_counters_exit(struct bch_fs *c)
-{
-       free_percpu(c->counters);
-}
-
-int bch2_fs_counters_init(struct bch_fs *c)
-{
-       c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
-       if (!c->counters)
-               return -BCH_ERR_ENOMEM_fs_counters_init;
-
-       return bch2_sb_counters_to_cpu(c);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_counters = {
-       .validate       = bch2_sb_counters_validate,
-       .to_text        = bch2_sb_counters_to_text,
-};
-
-#ifndef NO_BCACHEFS_CHARDEV
-long bch2_ioctl_query_counters(struct bch_fs *c,
-                       struct bch_ioctl_query_counters __user *user_arg)
-{
-       struct bch_ioctl_query_counters arg;
-       int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
-       if (ret)
-               return ret;
-
-       if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
-           arg.pad)
-               return -EINVAL;
-
-       arg.nr = min(arg.nr, BCH_COUNTER_NR);
-       ret = put_user(arg.nr, &user_arg->nr);
-       if (ret)
-               return ret;
-
-       for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
-               unsigned stable = counters_to_stable_map[i];
-
-               if (stable < arg.nr) {
-                       u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
-                               ? percpu_u64_get(&c->counters[i])
-                               : c->counters_on_mount[i];
-
-                       ret = put_user(v, &user_arg->d[stable]);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       return 0;
-}
-#endif
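
Note that the counters code above never stores a counter at its enum index on disk: every access goes through counters_to_stable_map, so the in-memory enum can be reordered between releases while on-disk slots stay fixed. A toy round trip under that scheme, using three hypothetical counters and plain arrays in place of the superblock field and percpu counters:

#include <stdint.h>
#include <stdio.h>

/* Three hypothetical counters; to_stable[] pins each one to a fixed
 * on-disk slot, independent of its position in the enum. */
enum { CTR_A, CTR_B, CTR_C, CTR_NR };

static const uint8_t to_stable[CTR_NR] = { 2, 0, 1 };

int main(void)
{
	uint64_t in_mem[CTR_NR]  = { 10, 20, 30 };
	uint64_t on_disk[CTR_NR] = {0};
	uint64_t back[CTR_NR]    = {0};

	/* cf. bch2_sb_counters_from_cpu(): write each counter to its slot */
	for (unsigned i = 0; i < CTR_NR; i++)
		on_disk[to_stable[i]] = in_mem[i];

	/* cf. bch2_sb_counters_to_cpu(): read each counter back */
	for (unsigned i = 0; i < CTR_NR; i++)
		back[i] = on_disk[to_stable[i]];

	for (unsigned i = 0; i < CTR_NR; i++)
		printf("counter %u: %llu\n", i, (unsigned long long) back[i]);
	return 0;
}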
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
deleted file mode 100644 (file)
index a4329ad..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_H
-#define _BCACHEFS_SB_COUNTERS_H
-
-#include "bcachefs.h"
-#include "super-io.h"
-
-int bch2_sb_counters_to_cpu(struct bch_fs *);
-int bch2_sb_counters_from_cpu(struct bch_fs *);
-
-void bch2_fs_counters_exit(struct bch_fs *);
-int bch2_fs_counters_init(struct bch_fs *);
-
-extern const char * const bch2_counter_names[];
-extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-
-long bch2_ioctl_query_counters(struct bch_fs *,
-                       struct bch_ioctl_query_counters __user *);
-
-#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
deleted file mode 100644 (file)
index b868702..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-
-enum counters_flags {
-       TYPE_COUNTER    = BIT(0),       /* event counters */
-       TYPE_SECTORS    = BIT(1),       /* quantity counters, measured in sectors */
-};
-
-#define BCH_PERSISTENT_COUNTERS()                                      \
-       x(io_read,                                      0,      TYPE_SECTORS)   \
-       x(io_read_inline,                               80,     TYPE_SECTORS)   \
-       x(io_read_hole,                                 81,     TYPE_SECTORS)   \
-       x(io_read_promote,                              30,     TYPE_COUNTER)   \
-       x(io_read_bounce,                               31,     TYPE_COUNTER)   \
-       x(io_read_split,                                33,     TYPE_COUNTER)   \
-       x(io_read_reuse_race,                           34,     TYPE_COUNTER)   \
-       x(io_read_retry,                                32,     TYPE_COUNTER)   \
-       x(io_read_fail_and_poison,                      82,     TYPE_COUNTER)   \
-       x(io_write,                                     1,      TYPE_SECTORS)   \
-       x(io_move,                                      2,      TYPE_SECTORS)   \
-       x(io_move_read,                                 35,     TYPE_SECTORS)   \
-       x(io_move_write,                                36,     TYPE_SECTORS)   \
-       x(io_move_finish,                               37,     TYPE_SECTORS)   \
-       x(io_move_fail,                                 38,     TYPE_COUNTER)   \
-       x(io_move_write_fail,                           82,     TYPE_COUNTER)   \
-       x(io_move_start_fail,                           39,     TYPE_COUNTER)   \
-       x(io_move_created_rebalance,                    83,     TYPE_COUNTER)   \
-       x(io_move_evacuate_bucket,                      84,     TYPE_COUNTER)   \
-       x(bucket_invalidate,                            3,      TYPE_COUNTER)   \
-       x(bucket_discard,                               4,      TYPE_COUNTER)   \
-       x(bucket_discard_fast,                          79,     TYPE_COUNTER)   \
-       x(bucket_alloc,                                 5,      TYPE_COUNTER)   \
-       x(bucket_alloc_fail,                            6,      TYPE_COUNTER)   \
-       x(btree_cache_scan,                             7,      TYPE_COUNTER)   \
-       x(btree_cache_reap,                             8,      TYPE_COUNTER)   \
-       x(btree_cache_cannibalize,                      9,      TYPE_COUNTER)   \
-       x(btree_cache_cannibalize_lock,                 10,     TYPE_COUNTER)   \
-       x(btree_cache_cannibalize_lock_fail,            11,     TYPE_COUNTER)   \
-       x(btree_cache_cannibalize_unlock,               12,     TYPE_COUNTER)   \
-       x(btree_node_write,                             13,     TYPE_COUNTER)   \
-       x(btree_node_read,                              14,     TYPE_COUNTER)   \
-       x(btree_node_compact,                           15,     TYPE_COUNTER)   \
-       x(btree_node_merge,                             16,     TYPE_COUNTER)   \
-       x(btree_node_split,                             17,     TYPE_COUNTER)   \
-       x(btree_node_rewrite,                           18,     TYPE_COUNTER)   \
-       x(btree_node_alloc,                             19,     TYPE_COUNTER)   \
-       x(btree_node_free,                              20,     TYPE_COUNTER)   \
-       x(btree_node_set_root,                          21,     TYPE_COUNTER)   \
-       x(btree_path_relock_fail,                       22,     TYPE_COUNTER)   \
-       x(btree_path_upgrade_fail,                      23,     TYPE_COUNTER)   \
-       x(btree_reserve_get_fail,                       24,     TYPE_COUNTER)   \
-       x(journal_entry_full,                           25,     TYPE_COUNTER)   \
-       x(journal_full,                                 26,     TYPE_COUNTER)   \
-       x(journal_reclaim_finish,                       27,     TYPE_COUNTER)   \
-       x(journal_reclaim_start,                        28,     TYPE_COUNTER)   \
-       x(journal_write,                                29,     TYPE_COUNTER)   \
-       x(copygc,                                       40,     TYPE_COUNTER)   \
-       x(copygc_wait,                                  41,     TYPE_COUNTER)   \
-       x(gc_gens_end,                                  42,     TYPE_COUNTER)   \
-       x(gc_gens_start,                                43,     TYPE_COUNTER)   \
-       x(trans_blocked_journal_reclaim,                44,     TYPE_COUNTER)   \
-       x(trans_restart_btree_node_reused,              45,     TYPE_COUNTER)   \
-       x(trans_restart_btree_node_split,               46,     TYPE_COUNTER)   \
-       x(trans_restart_fault_inject,                   47,     TYPE_COUNTER)   \
-       x(trans_restart_iter_upgrade,                   48,     TYPE_COUNTER)   \
-       x(trans_restart_journal_preres_get,             49,     TYPE_COUNTER)   \
-       x(trans_restart_journal_reclaim,                50,     TYPE_COUNTER)   \
-       x(trans_restart_journal_res_get,                51,     TYPE_COUNTER)   \
-       x(trans_restart_key_cache_key_realloced,        52,     TYPE_COUNTER)   \
-       x(trans_restart_key_cache_raced,                53,     TYPE_COUNTER)   \
-       x(trans_restart_mark_replicas,                  54,     TYPE_COUNTER)   \
-       x(trans_restart_mem_realloced,                  55,     TYPE_COUNTER)   \
-       x(trans_restart_memory_allocation_failure,      56,     TYPE_COUNTER)   \
-       x(trans_restart_relock,                         57,     TYPE_COUNTER)   \
-       x(trans_restart_relock_after_fill,              58,     TYPE_COUNTER)   \
-       x(trans_restart_relock_key_cache_fill,          59,     TYPE_COUNTER)   \
-       x(trans_restart_relock_next_node,               60,     TYPE_COUNTER)   \
-       x(trans_restart_relock_parent_for_fill,         61,     TYPE_COUNTER)   \
-       x(trans_restart_relock_path,                    62,     TYPE_COUNTER)   \
-       x(trans_restart_relock_path_intent,             63,     TYPE_COUNTER)   \
-       x(trans_restart_too_many_iters,                 64,     TYPE_COUNTER)   \
-       x(trans_restart_traverse,                       65,     TYPE_COUNTER)   \
-       x(trans_restart_upgrade,                        66,     TYPE_COUNTER)   \
-       x(trans_restart_would_deadlock,                 67,     TYPE_COUNTER)   \
-       x(trans_restart_would_deadlock_write,           68,     TYPE_COUNTER)   \
-       x(trans_restart_injected,                       69,     TYPE_COUNTER)   \
-       x(trans_restart_key_cache_upgrade,              70,     TYPE_COUNTER)   \
-       x(trans_traverse_all,                           71,     TYPE_COUNTER)   \
-       x(transaction_commit,                           72,     TYPE_COUNTER)   \
-       x(write_super,                                  73,     TYPE_COUNTER)   \
-       x(trans_restart_would_deadlock_recursion_limit, 74,     TYPE_COUNTER)   \
-       x(trans_restart_write_buffer_flush,             75,     TYPE_COUNTER)   \
-       x(trans_restart_split_race,                     76,     TYPE_COUNTER)   \
-       x(write_buffer_flush_slowpath,                  77,     TYPE_COUNTER)   \
-       x(write_buffer_flush_sync,                      78,     TYPE_COUNTER)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-       BCH_COUNTER_NR
-};
-
-enum bch_persistent_counters_stable {
-#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-       BCH_COUNTER_STABLE_NR
-};
-
-struct bch_sb_field_counters {
-       struct bch_sb_field     field;
-       __le64                  d[];
-};
-
-#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
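
BCH_PERSISTENT_COUNTERS() above is an x-macro: the one list expands into the runtime enum, the stable on-disk enum, and the name-string table. A minimal demonstration of the pattern, with two hypothetical counters rather than the real set:

#include <stdio.h>

/* One list, three expansions. */
#define COUNTERS()       \
	x(reads,  0)     \
	x(writes, 1)

enum counter {                          /* runtime order, may change */
#define x(t, n) COUNTER_##t,
	COUNTERS()
#undef x
	COUNTER_NR
};

enum counter_stable {                   /* fixed on-disk numbering */
#define x(t, n) COUNTER_STABLE_##t = n,
	COUNTERS()
#undef x
};

static const char * const counter_names[] = {
#define x(t, n) #t,
	COUNTERS()
#undef x
	NULL
};

int main(void)
{
	for (int i = 0; i < COUNTER_NR; i++)
		printf("%d: %s\n", i, counter_names[i]);
	return 0;
}

Adding a counter means appending one x() line with a fresh stable number; all three tables stay in sync automatically.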
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
deleted file mode 100644 (file)
index 1506d05..0000000
+++ /dev/null
@@ -1,457 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Superblock section that contains a list of recovery passes to run when
- * downgrading past a given version
- */
-
-#include "bcachefs.h"
-#include "darray.h"
-#include "recovery_passes.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-#define RECOVERY_PASS_ALL_FSCK         BIT_ULL(63)
-
-/*
- * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
- *
- * x(version, recovery_passes, errors...)
- */
-#define UPGRADE_TABLE()                                                \
-       x(snapshot_2,                                           \
-         RECOVERY_PASS_ALL_FSCK,                               \
-         BCH_FSCK_ERR_subvol_root_wrong_bi_subvol,             \
-         BCH_FSCK_ERR_subvol_not_master_and_not_snapshot)      \
-       x(backpointers,                                         \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(inode_v3,                                             \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(unwritten_extents,                                    \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(bucket_gens,                                          \
-         BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)|          \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(lru_v2,                                               \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(fragmentation_lru,                                    \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(no_bps_in_alloc_keys,                                 \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(snapshot_trees,                                       \
-         RECOVERY_PASS_ALL_FSCK)                               \
-       x(snapshot_skiplists,                                   \
-         BIT_ULL(BCH_RECOVERY_PASS_check_snapshots),           \
-         BCH_FSCK_ERR_snapshot_bad_depth,                      \
-         BCH_FSCK_ERR_snapshot_bad_skiplist)                   \
-       x(deleted_inodes,                                       \
-         BIT_ULL(BCH_RECOVERY_PASS_check_inodes),              \
-         BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list)      \
-       x(rebalance_work,                                       \
-         BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))    \
-       x(subvolume_fs_parent,                                  \
-         BIT_ULL(BCH_RECOVERY_PASS_check_dirents),             \
-         BCH_FSCK_ERR_subvol_fs_path_parent_wrong)             \
-       x(btree_subvolume_children,                             \
-         BIT_ULL(BCH_RECOVERY_PASS_check_subvols),             \
-         BCH_FSCK_ERR_subvol_children_not_set)                 \
-       x(mi_btree_bitmap,                                      \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_btree_bitmap_not_marked)                 \
-       x(disk_accounting_v2,                                   \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_bkey_version_in_future,                  \
-         BCH_FSCK_ERR_dev_usage_buckets_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_sectors_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_fragmented_wrong,              \
-         BCH_FSCK_ERR_accounting_mismatch)                     \
-       x(disk_accounting_v3,                                   \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_bkey_version_in_future,                  \
-         BCH_FSCK_ERR_dev_usage_buckets_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_sectors_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_fragmented_wrong,              \
-         BCH_FSCK_ERR_accounting_mismatch,                     \
-         BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,       \
-         BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
-         BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted,   \
-         BCH_FSCK_ERR_accounting_key_junk_at_end)              \
-       x(disk_accounting_inum,                                 \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_accounting_mismatch)                     \
-       x(rebalance_work_acct_fix,                              \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_accounting_mismatch)                     \
-       x(inode_has_child_snapshots,                            \
-         BIT_ULL(BCH_RECOVERY_PASS_check_inodes),              \
-         BCH_FSCK_ERR_inode_has_child_snapshots_wrong)         \
-       x(backpointer_bucket_gen,                               \
-         BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
-         BCH_FSCK_ERR_backpointer_to_missing_ptr,              \
-         BCH_FSCK_ERR_ptr_to_missing_backpointer)              \
-       x(disk_accounting_big_endian,                           \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_accounting_mismatch,                     \
-         BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,       \
-         BCH_FSCK_ERR_accounting_key_junk_at_end)              \
-       x(cached_backpointers,                                  \
-         BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
-         BCH_FSCK_ERR_ptr_to_missing_backpointer)              \
-       x(stripe_backpointers,                                  \
-         BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
-         BCH_FSCK_ERR_ptr_to_missing_backpointer)              \
-       x(inode_has_case_insensitive,                           \
-         BIT_ULL(BCH_RECOVERY_PASS_check_inodes),              \
-         BCH_FSCK_ERR_inode_has_case_insensitive_not_set,      \
-         BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
-
-#define DOWNGRADE_TABLE()                                      \
-       x(bucket_stripe_sectors,                                \
-         0)                                                    \
-       x(disk_accounting_v2,                                   \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_dev_usage_buckets_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_sectors_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_fragmented_wrong,              \
-         BCH_FSCK_ERR_fs_usage_hidden_wrong,                   \
-         BCH_FSCK_ERR_fs_usage_btree_wrong,                    \
-         BCH_FSCK_ERR_fs_usage_data_wrong,                     \
-         BCH_FSCK_ERR_fs_usage_cached_wrong,                   \
-         BCH_FSCK_ERR_fs_usage_reserved_wrong,                 \
-         BCH_FSCK_ERR_fs_usage_nr_inodes_wrong,                \
-         BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong,      \
-         BCH_FSCK_ERR_fs_usage_replicas_wrong,                 \
-         BCH_FSCK_ERR_bkey_version_in_future)                  \
-       x(disk_accounting_v3,                                   \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_dev_usage_buckets_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_sectors_wrong,                 \
-         BCH_FSCK_ERR_dev_usage_fragmented_wrong,              \
-         BCH_FSCK_ERR_fs_usage_hidden_wrong,                   \
-         BCH_FSCK_ERR_fs_usage_btree_wrong,                    \
-         BCH_FSCK_ERR_fs_usage_data_wrong,                     \
-         BCH_FSCK_ERR_fs_usage_cached_wrong,                   \
-         BCH_FSCK_ERR_fs_usage_reserved_wrong,                 \
-         BCH_FSCK_ERR_fs_usage_nr_inodes_wrong,                \
-         BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong,      \
-         BCH_FSCK_ERR_fs_usage_replicas_wrong,                 \
-         BCH_FSCK_ERR_accounting_replicas_not_marked,          \
-         BCH_FSCK_ERR_bkey_version_in_future)                  \
-       x(rebalance_work_acct_fix,                              \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_accounting_mismatch,                     \
-         BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,       \
-         BCH_FSCK_ERR_accounting_key_junk_at_end)              \
-       x(backpointer_bucket_gen,                               \
-         BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
-         BCH_FSCK_ERR_backpointer_bucket_offset_wrong,         \
-         BCH_FSCK_ERR_backpointer_to_missing_ptr,              \
-         BCH_FSCK_ERR_ptr_to_missing_backpointer)              \
-       x(disk_accounting_big_endian,                           \
-         BIT_ULL(BCH_RECOVERY_PASS_check_allocations),         \
-         BCH_FSCK_ERR_accounting_mismatch,                     \
-         BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,       \
-         BCH_FSCK_ERR_accounting_key_junk_at_end)
-
-struct upgrade_downgrade_entry {
-       u64             recovery_passes;
-       u16             version;
-       u16             nr_errors;
-       const u16       *errors;
-};
-
-#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
-UPGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry upgrade_table[] = {
-#define x(ver, passes, ...) {                                  \
-       .recovery_passes        = passes,                       \
-       .version                = bcachefs_metadata_version_##ver,\
-       .nr_errors              = ARRAY_SIZE(upgrade_##ver##_errors),   \
-       .errors                 = upgrade_##ver##_errors,       \
-},
-UPGRADE_TABLE()
-#undef x
-};
-
-static int have_stripes(struct bch_fs *c)
-{
-       if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
-               return 0;
-
-       return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
-}
-
-int bch2_sb_set_upgrade_extra(struct bch_fs *c)
-{
-       unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
-       unsigned new_version = c->sb.version;
-       bool write_sb = false;
-       int ret = 0;
-
-       mutex_lock(&c->sb_lock);
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
-       if (old_version <  bcachefs_metadata_version_bucket_stripe_sectors &&
-           new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
-           (ret = have_stripes(c) > 0)) {
-               __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
-               __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
-               write_sb = true;
-       }
-
-       if (write_sb)
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret < 0 ? ret : 0;
-}
-
-void bch2_sb_set_upgrade(struct bch_fs *c,
-                        unsigned old_version,
-                        unsigned new_version)
-{
-       lockdep_assert_held(&c->sb_lock);
-
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
-       for (const struct upgrade_downgrade_entry *i = upgrade_table;
-            i < upgrade_table + ARRAY_SIZE(upgrade_table);
-            i++)
-               if (i->version > old_version && i->version <= new_version) {
-                       u64 passes = i->recovery_passes;
-
-                       if (passes & RECOVERY_PASS_ALL_FSCK)
-                               passes |= bch2_fsck_recovery_passes();
-                       passes &= ~RECOVERY_PASS_ALL_FSCK;
-
-                       ext->recovery_passes_required[0] |=
-                               cpu_to_le64(bch2_recovery_passes_to_stable(passes));
-
-                       for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
-                               __set_bit_le64(*e, ext->errors_silent);
-               }
-}
-
-#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
-DOWNGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry downgrade_table[] = {
-#define x(ver, passes, ...) {                                  \
-       .recovery_passes        = passes,                       \
-       .version                = bcachefs_metadata_version_##ver,\
-       .nr_errors              = ARRAY_SIZE(downgrade_##ver##_errors), \
-       .errors                 = downgrade_##ver##_errors,     \
-},
-DOWNGRADE_TABLE()
-#undef x
-};
-
-static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
-{
-       unsigned dst_offset = table->nr;
-       struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
-       unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
-       int ret = 0;
-
-       unsigned nr_errors = le16_to_cpu(dst->nr_errors);
-
-       switch (le16_to_cpu(dst->version)) {
-       case bcachefs_metadata_version_bucket_stripe_sectors:
-               if (have_stripes(c)) {
-                       bytes += sizeof(dst->errors[0]) * 2;
-
-                       ret = darray_make_room(table, bytes);
-                       if (ret)
-                               return ret;
-
-                       dst = (void *) &table->data[dst_offset];
-                       dst->nr_errors = cpu_to_le16(nr_errors + 1);
-
-                       /* open-coded __set_bit_le64, as dst is packed and
-                        * dst->recovery_passes is misaligned */
-                       unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
-                       dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
-
-                       dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
-               }
-               break;
-       }
-
-       return ret;
-}
-
-static inline const struct bch_sb_field_downgrade_entry *
-downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
-{
-       return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
-}
-
-#define for_each_downgrade_entry(_d, _i)                                               \
-       for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries;             \
-            (void *) _i        < vstruct_end(&(_d)->field) &&                          \
-            (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) &&                    \
-            (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field);          \
-            _i = downgrade_entry_next_c(_i))
-
-static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                     enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
-       for (const struct bch_sb_field_downgrade_entry *i = e->entries;
-            (void *) i < vstruct_end(&e->field);
-            i = downgrade_entry_next_c(i)) {
-               /*
-                * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
-                * section sizes are 8 byte aligned - an empty entry spanning
-                * the end of the section is allowed (and ignored):
-                */
-               if ((void *) &i->errors[0] > vstruct_end(&e->field))
-                       break;
-
-               if (flags & BCH_VALIDATE_write &&
-                   (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
-                       prt_printf(err, "downgrade entry overruns end of superblock section");
-                       return -BCH_ERR_invalid_sb_downgrade;
-               }
-
-               if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
-                   BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
-                       prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
-                                  BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
-                                  BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
-                       return -BCH_ERR_invalid_sb_downgrade;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
-                                     struct bch_sb_field *f)
-{
-       struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
-       if (out->nr_tabstops <= 1)
-               printbuf_tabstop_push(out, 16);
-
-       for_each_downgrade_entry(e, i) {
-               prt_str(out, "version:\t");
-               bch2_version_to_text(out, le16_to_cpu(i->version));
-               prt_newline(out);
-
-               prt_str(out, "recovery passes:\t");
-               prt_bitflags(out, bch2_recovery_passes,
-                            bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
-               prt_newline(out);
-
-               prt_str(out, "errors:\t");
-               bool first = true;
-               for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
-                       if (!first)
-                               prt_char(out, ',');
-                       first = false;
-                       bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
-               }
-               prt_newline(out);
-       }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
-       .validate       = bch2_sb_downgrade_validate,
-       .to_text        = bch2_sb_downgrade_to_text,
-};
-
-int bch2_sb_downgrade_update(struct bch_fs *c)
-{
-       if (!test_bit(BCH_FS_btree_running, &c->flags))
-               return 0;
-
-       darray_char table = {};
-       int ret = 0;
-
-       for (const struct upgrade_downgrade_entry *src = downgrade_table;
-            src < downgrade_table + ARRAY_SIZE(downgrade_table);
-            src++) {
-               if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
-                       continue;
-
-               if (src->version < c->sb.version_incompat)
-                       continue;
-
-               struct bch_sb_field_downgrade_entry *dst;
-               unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
-
-               ret = darray_make_room(&table, bytes);
-               if (ret)
-                       goto out;
-
-               dst = (void *) &darray_top(table);
-               dst->version = cpu_to_le16(src->version);
-               dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
-               dst->recovery_passes[1] = 0;
-               dst->nr_errors          = cpu_to_le16(src->nr_errors);
-               for (unsigned i = 0; i < src->nr_errors; i++)
-                       dst->errors[i] = cpu_to_le16(src->errors[i]);
-
-               ret = downgrade_table_extra(c, &table);
-               if (ret)
-                       goto out;
-
-               if (!dst->recovery_passes[0] &&
-                   !dst->recovery_passes[1] &&
-                   !dst->nr_errors)
-                       continue;
-
-               table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
-       }
-
-       struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
-
-       unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
-
-       if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
-               goto out;
-
-       d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
-       if (!d) {
-               ret = bch_err_throw(c, ENOSPC_sb_downgrade);
-               goto out;
-       }
-
-       memcpy(d->entries, table.data, table.nr);
-       memset_u64s_tail(d->entries, 0, table.nr);
-out:
-       darray_exit(&table);
-       return ret;
-}
-
-void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
-{
-       struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
-       if (!d)
-               return;
-
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
-       for_each_downgrade_entry(d, i) {
-               unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
-               if (new_minor < minor && minor <= old_minor) {
-                       ext->recovery_passes_required[0] |= i->recovery_passes[0];
-                       ext->recovery_passes_required[1] |= i->recovery_passes[1];
-
-                       for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
-                               unsigned e = le16_to_cpu(i->errors[j]);
-                               if (e < BCH_FSCK_ERR_MAX)
-                                       __set_bit(e, c->sb.errors_silent);
-                               if (e < sizeof(ext->errors_silent) * 8)
-                                       __set_bit_le64(e, ext->errors_silent);
-                       }
-               }
-       }
-}
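
downgrade_table_extra() above open-codes __set_bit_le64 because the __packed entry leaves recovery_passes misaligned. An equivalent byte-wise formulation (a sketch, not the kernel helper) sidesteps both the alignment and the host-endianness questions, since bit b of an array of little-endian 64-bit words always lands in byte b/8, bit b%8:

#include <stdint.h>
#include <stdio.h>

/* Set bit b in an array of little-endian 64-bit words. Byte-wise
 * access needs no alignment and is endian-independent. */
static void set_bit_le64(void *array, unsigned b)
{
	((uint8_t *) array)[b / 8] |= 1u << (b % 8);
}

int main(void)
{
	uint8_t passes[16] = {0};       /* two __le64-style words */

	set_bit_le64(passes, 0);        /* bit 0 of word 0 */
	set_bit_le64(passes, 65);       /* bit 1 of word 1 */

	printf("%02x %02x\n", passes[0], passes[8]);    /* prints: 01 02 */
	return 0;
}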
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
deleted file mode 100644 (file)
index 095b7cc..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_H
-#define _BCACHEFS_SB_DOWNGRADE_H
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
-
-int bch2_sb_downgrade_update(struct bch_fs *);
-void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
-int bch2_sb_set_upgrade_extra(struct bch_fs *);
-void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
deleted file mode 100644 (file)
index cffd932..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-
-struct bch_sb_field_downgrade_entry {
-       __le16                  version;
-       __le64                  recovery_passes[2];
-       __le16                  nr_errors;
-       __le16                  errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
-       struct bch_sb_field     field;
-       struct bch_sb_field_downgrade_entry entries[];
-};
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
deleted file mode 100644 (file)
index 48853ef..0000000
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-const char * const bch2_sb_error_strs[] = {
-#define x(t, n, ...) [n] = #t,
-       BCH_SB_ERRS()
-#undef x
-};
-
-void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
-{
-       if (id < BCH_FSCK_ERR_MAX)
-               prt_str(out, bch2_sb_error_strs[id]);
-       else
-               prt_printf(out, "(unknown error %u)", id);
-}
-
-static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
-{
-       return bch2_sb_field_nr_entries(e);
-}
-
-static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
-{
-       return (sizeof(struct bch_sb_field_errors) +
-               sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
-}
-
-static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                  enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_errors *e = field_to_type(f, errors);
-       unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
-       for (i = 0; i < nr; i++) {
-               if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
-                       prt_printf(err, "entry with count 0 (id ");
-                       bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
-                       prt_printf(err, ")");
-                       return -BCH_ERR_invalid_sb_errors;
-               }
-
-               if (i + 1 < nr &&
-                   BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
-                   BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
-                       prt_printf(err, "entries out of order");
-                       return -BCH_ERR_invalid_sb_errors;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
-                                  struct bch_sb_field *f)
-{
-       struct bch_sb_field_errors *e = field_to_type(f, errors);
-       unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
-       if (out->nr_tabstops <= 1)
-               printbuf_tabstop_push(out, 16);
-
-       for (i = 0; i < nr; i++) {
-               bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
-               prt_tab(out);
-               prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
-               prt_tab(out);
-               bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
-               prt_newline(out);
-       }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_errors = {
-       .validate       = bch2_sb_errors_validate,
-       .to_text        = bch2_sb_errors_to_text,
-};
-
-void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       if (out->nr_tabstops < 1)
-               printbuf_tabstop_push(out, 48);
-       if (out->nr_tabstops < 2)
-               printbuf_tabstop_push(out, 8);
-       if (out->nr_tabstops < 3)
-               printbuf_tabstop_push(out, 16);
-
-       guard(mutex)(&c->fsck_error_counts_lock);
-
-       bch_sb_errors_cpu *e = &c->fsck_error_counts;
-       darray_for_each(*e, i) {
-               bch2_sb_error_id_to_text(out, i->id);
-               prt_tab(out);
-               prt_u64(out, i->nr);
-               prt_tab(out);
-               bch2_prt_datetime(out, i->last_error_time);
-               prt_newline(out);
-       }
-}
-
-void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
-{
-       bch_sb_errors_cpu *e = &c->fsck_error_counts;
-       struct bch_sb_error_entry_cpu n = {
-               .id = err,
-               .nr = 1,
-               .last_error_time = ktime_get_real_seconds()
-       };
-       unsigned i;
-
-       mutex_lock(&c->fsck_error_counts_lock);
-       for (i = 0; i < e->nr; i++) {
-               if (err == e->data[i].id) {
-                       e->data[i].nr++;
-                       e->data[i].last_error_time = n.last_error_time;
-                       goto out;
-               }
-               if (err < e->data[i].id)
-                       break;
-       }
-
-       if (darray_make_room(e, 1))
-               goto out;
-
-       darray_insert_item(e, i, n);
-out:
-       mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-void bch2_sb_errors_from_cpu(struct bch_fs *c)
-{
-       bch_sb_errors_cpu *src = &c->fsck_error_counts;
-       struct bch_sb_field_errors *dst;
-       unsigned i;
-
-       mutex_lock(&c->fsck_error_counts_lock);
-
-       dst = bch2_sb_field_resize(&c->disk_sb, errors,
-                                  bch2_sb_field_errors_u64s(src->nr));
-
-       if (!dst)
-               goto err;
-
-       for (i = 0; i < src->nr; i++) {
-               SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
-               SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
-               dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
-       }
-
-err:
-       mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-static int bch2_sb_errors_to_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
-       bch_sb_errors_cpu *dst = &c->fsck_error_counts;
-       unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
-       int ret;
-
-       if (!nr)
-               return 0;
-
-       mutex_lock(&c->fsck_error_counts_lock);
-       ret = darray_make_room(dst, nr);
-       if (ret)
-               goto err;
-
-       dst->nr = nr;
-
-       for (i = 0; i < nr; i++) {
-               dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
-               dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
-               dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
-       }
-err:
-       mutex_unlock(&c->fsck_error_counts_lock);
-
-       return ret;
-}
-
-void bch2_fs_sb_errors_exit(struct bch_fs *c)
-{
-       darray_exit(&c->fsck_error_counts);
-}
-
-void bch2_fs_sb_errors_init_early(struct bch_fs *c)
-{
-       mutex_init(&c->fsck_error_counts_lock);
-       darray_init(&c->fsck_error_counts);
-}
-
-int bch2_fs_sb_errors_init(struct bch_fs *c)
-{
-       return bch2_sb_errors_to_cpu(c);
-}
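
bch2_sb_error_count() above keeps the per-filesystem error table sorted by id, so one linear scan either bumps an existing entry or yields the insertion point for a new one. The same logic as a stand-alone sketch, with a fixed-size array standing in for the kernel's darray:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct err_entry { uint16_t id; uint64_t nr; };

static void count_error(struct err_entry *tbl, unsigned *nr, unsigned max,
			uint16_t id)
{
	unsigned i;

	/* scan in id order: bump a match, or stop at the insertion point */
	for (i = 0; i < *nr; i++) {
		if (tbl[i].id == id) {
			tbl[i].nr++;
			return;
		}
		if (id < tbl[i].id)
			break;
	}

	if (*nr == max)         /* the kernel version drops the count on ENOMEM, too */
		return;

	memmove(&tbl[i + 1], &tbl[i], (*nr - i) * sizeof(*tbl));
	tbl[i] = (struct err_entry) { .id = id, .nr = 1 };
	(*nr)++;
}

int main(void)
{
	struct err_entry tbl[8];
	unsigned nr = 0;

	count_error(tbl, &nr, 8, 42);
	count_error(tbl, &nr, 8, 7);
	count_error(tbl, &nr, 8, 42);

	for (unsigned i = 0; i < nr; i++)
		printf("id %u: %llu\n", tbl[i].id, (unsigned long long) tbl[i].nr);
	return 0;
}

Keeping the table sorted is what lets bch2_sb_errors_from_cpu() copy it straight into the superblock section, where the validate hook above enforces strictly increasing ids.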
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
deleted file mode 100644 (file)
index e862672..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_H
-#define _BCACHEFS_SB_ERRORS_H
-
-#include "sb-errors_types.h"
-
-extern const char * const bch2_sb_error_strs[];
-
-void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
-void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
-
-void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
-
-void bch2_sb_errors_from_cpu(struct bch_fs *);
-
-void bch2_fs_sb_errors_exit(struct bch_fs *);
-void bch2_fs_sb_errors_init_early(struct bch_fs *);
-int bch2_fs_sb_errors_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
deleted file mode 100644 (file)
index d154b76..0000000
+++ /dev/null
@@ -1,353 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
-#define _BCACHEFS_SB_ERRORS_FORMAT_H
-
-enum bch_fsck_flags {
-       FSCK_CAN_FIX            = BIT(0),
-       FSCK_CAN_IGNORE         = BIT(1),
-       FSCK_AUTOFIX            = BIT(2),
-       FSCK_ERR_NO_LOG         = BIT(3),
-};
-
-#define BCH_SB_ERRS()                                                                  \
-       x(clean_but_journal_not_empty,                            0,    0)              \
-       x(dirty_but_no_journal_entries,                           1,    0)              \
-       x(dirty_but_no_journal_entries_post_drop_nonflushes,      2,    0)              \
-       x(sb_clean_journal_seq_mismatch,                          3,    0)              \
-       x(sb_clean_btree_root_mismatch,                           4,    0)              \
-       x(sb_clean_missing,                                       5,    0)              \
-       x(jset_unsupported_version,                               6,    0)              \
-       x(jset_unknown_csum,                                      7,    0)              \
-       x(jset_last_seq_newer_than_seq,                           8,    0)              \
-       x(jset_past_bucket_end,                                   9,    0)              \
-       x(jset_seq_blacklisted,                                  10,    0)              \
-       x(journal_entries_missing,                               11,    0)              \
-       x(journal_entry_replicas_not_marked,                     12,    FSCK_AUTOFIX)   \
-       x(journal_entry_past_jset_end,                           13,    0)              \
-       x(journal_entry_replicas_data_mismatch,                  14,    0)              \
-       x(journal_entry_bkey_u64s_0,                             15,    0)              \
-       x(journal_entry_bkey_past_end,                           16,    0)              \
-       x(journal_entry_bkey_bad_format,                         17,    0)              \
-       x(journal_entry_bkey_invalid,                            18,    0)              \
-       x(journal_entry_btree_root_bad_size,                     19,    0)              \
-       x(journal_entry_blacklist_bad_size,                      20,    0)              \
-       x(journal_entry_blacklist_v2_bad_size,                   21,    0)              \
-       x(journal_entry_blacklist_v2_start_past_end,             22,    0)              \
-       x(journal_entry_usage_bad_size,                          23,    0)              \
-       x(journal_entry_data_usage_bad_size,                     24,    0)              \
-       x(journal_entry_clock_bad_size,                          25,    0)              \
-       x(journal_entry_clock_bad_rw,                            26,    0)              \
-       x(journal_entry_dev_usage_bad_size,                      27,    0)              \
-       x(journal_entry_dev_usage_bad_dev,                       28,    0)              \
-       x(journal_entry_dev_usage_bad_pad,                       29,    0)              \
-       x(btree_node_unreadable,                                 30,    0)              \
-       x(btree_node_fault_injected,                             31,    0)              \
-       x(btree_node_bad_magic,                                  32,    0)              \
-       x(btree_node_bad_seq,                                    33,    0)              \
-       x(btree_node_unsupported_version,                        34,    0)              \
-       x(btree_node_bset_older_than_sb_min,                     35,    0)              \
-       x(btree_node_bset_newer_than_sb,                         36,    0)              \
-       x(btree_node_data_missing,                               37,    FSCK_AUTOFIX)   \
-       x(btree_node_bset_after_end,                             38,    0)              \
-       x(btree_node_replicas_sectors_written_mismatch,          39,    0)              \
-       x(btree_node_replicas_data_mismatch,                     40,    0)              \
-       x(bset_unknown_csum,                                     41,    0)              \
-       x(bset_bad_csum,                                         42,    0)              \
-       x(bset_past_end_of_btree_node,                           43,    0)              \
-       x(bset_wrong_sector_offset,                              44,    0)              \
-       x(bset_empty,                                            45,    0)              \
-       x(bset_bad_seq,                                          46,    0)              \
-       x(bset_blacklisted_journal_seq,                          47,    FSCK_AUTOFIX)   \
-       x(first_bset_blacklisted_journal_seq,                    48,    FSCK_AUTOFIX)   \
-       x(btree_node_bad_btree,                                  49,    0)              \
-       x(btree_node_bad_level,                                  50,    0)              \
-       x(btree_node_bad_min_key,                                51,    0)              \
-       x(btree_node_bad_max_key,                                52,    0)              \
-       x(btree_node_bad_format,                                 53,    0)              \
-       x(btree_node_bkey_past_bset_end,                         54,    0)              \
-       x(btree_node_bkey_bad_format,                            55,    0)              \
-       x(btree_node_bad_bkey,                                   56,    0)              \
-       x(btree_node_bkey_out_of_order,                          57,    FSCK_AUTOFIX)   \
-       x(btree_root_bkey_invalid,                               58,    FSCK_AUTOFIX)   \
-       x(btree_root_read_error,                                 59,    FSCK_AUTOFIX)   \
-       x(btree_root_bad_min_key,                                60,    0)              \
-       x(btree_root_bad_max_key,                                61,    0)              \
-       x(btree_node_read_error,                                 62,    FSCK_AUTOFIX)   \
-       x(btree_node_topology_bad_min_key,                       63,    FSCK_AUTOFIX)   \
-       x(btree_node_topology_bad_max_key,                       64,    FSCK_AUTOFIX)   \
-       x(btree_node_topology_overwritten_by_prev_node,          65,    FSCK_AUTOFIX)   \
-       x(btree_node_topology_overwritten_by_next_node,          66,    FSCK_AUTOFIX)   \
-       x(btree_node_topology_interior_node_empty,               67,    FSCK_AUTOFIX)   \
-       x(fs_usage_hidden_wrong,                                 68,    FSCK_AUTOFIX)   \
-       x(fs_usage_btree_wrong,                                  69,    FSCK_AUTOFIX)   \
-       x(fs_usage_data_wrong,                                   70,    FSCK_AUTOFIX)   \
-       x(fs_usage_cached_wrong,                                 71,    FSCK_AUTOFIX)   \
-       x(fs_usage_reserved_wrong,                               72,    FSCK_AUTOFIX)   \
-       x(fs_usage_persistent_reserved_wrong,                    73,    FSCK_AUTOFIX)   \
-       x(fs_usage_nr_inodes_wrong,                              74,    FSCK_AUTOFIX)   \
-       x(fs_usage_replicas_wrong,                               75,    FSCK_AUTOFIX)   \
-       x(dev_usage_buckets_wrong,                               76,    FSCK_AUTOFIX)   \
-       x(dev_usage_sectors_wrong,                               77,    FSCK_AUTOFIX)   \
-       x(dev_usage_fragmented_wrong,                            78,    FSCK_AUTOFIX)   \
-       x(dev_usage_buckets_ec_wrong,                            79,    FSCK_AUTOFIX)   \
-       x(bkey_version_in_future,                                80,    0)              \
-       x(bkey_u64s_too_small,                                   81,    0)              \
-       x(bkey_invalid_type_for_btree,                           82,    0)              \
-       x(bkey_extent_size_zero,                                 83,    0)              \
-       x(bkey_extent_size_greater_than_offset,                  84,    0)              \
-       x(bkey_size_nonzero,                                     85,    0)              \
-       x(bkey_snapshot_nonzero,                                 86,    0)              \
-       x(bkey_snapshot_zero,                                    87,    0)              \
-       x(bkey_at_pos_max,                                       88,    0)              \
-       x(bkey_before_start_of_btree_node,                       89,    0)              \
-       x(bkey_after_end_of_btree_node,                          90,    0)              \
-       x(bkey_val_size_nonzero,                                 91,    0)              \
-       x(bkey_val_size_too_small,                               92,    0)              \
-       x(alloc_v1_val_size_bad,                                 93,    0)              \
-       x(alloc_v2_unpack_error,                                 94,    0)              \
-       x(alloc_v3_unpack_error,                                 95,    0)              \
-       x(alloc_v4_val_size_bad,                                 96,    0)              \
-       x(alloc_v4_backpointers_start_bad,                       97,    0)              \
-       x(alloc_key_data_type_bad,                               98,    0)              \
-       x(alloc_key_empty_but_have_data,                         99,    0)              \
-       x(alloc_key_dirty_sectors_0,                            100,    0)              \
-       x(alloc_key_data_type_inconsistency,                    101,    0)              \
-       x(alloc_key_to_missing_dev_bucket,                      102,    0)              \
-       x(alloc_key_cached_inconsistency,                       103,    0)              \
-       x(alloc_key_cached_but_read_time_zero,                  104,    FSCK_AUTOFIX)   \
-       x(alloc_key_to_missing_lru_entry,                       105,    FSCK_AUTOFIX)   \
-       x(alloc_key_data_type_wrong,                            106,    FSCK_AUTOFIX)   \
-       x(alloc_key_gen_wrong,                                  107,    FSCK_AUTOFIX)   \
-       x(alloc_key_dirty_sectors_wrong,                        108,    FSCK_AUTOFIX)   \
-       x(alloc_key_cached_sectors_wrong,                       109,    FSCK_AUTOFIX)   \
-       x(alloc_key_stripe_wrong,                               110,    FSCK_AUTOFIX)   \
-       x(alloc_key_stripe_redundancy_wrong,                    111,    FSCK_AUTOFIX)   \
-       x(alloc_key_journal_seq_in_future,                      298,    FSCK_AUTOFIX)   \
-       x(bucket_sector_count_overflow,                         112,    0)              \
-       x(bucket_metadata_type_mismatch,                        113,    0)              \
-       x(need_discard_key_wrong,                               114,    FSCK_AUTOFIX)   \
-       x(freespace_key_wrong,                                  115,    FSCK_AUTOFIX)   \
-       x(freespace_hole_missing,                               116,    FSCK_AUTOFIX)   \
-       x(bucket_gens_val_size_bad,                             117,    0)              \
-       x(bucket_gens_key_wrong,                                118,    FSCK_AUTOFIX)   \
-       x(bucket_gens_hole_wrong,                               119,    FSCK_AUTOFIX)   \
-       x(bucket_gens_to_invalid_dev,                           120,    FSCK_AUTOFIX)   \
-       x(bucket_gens_to_invalid_buckets,                       121,    FSCK_AUTOFIX)   \
-       x(bucket_gens_nonzero_for_invalid_buckets,              122,    FSCK_AUTOFIX)   \
-       x(need_discard_freespace_key_to_invalid_dev_bucket,     123,    0)              \
-       x(need_discard_freespace_key_bad,                       124,    FSCK_AUTOFIX)   \
-       x(discarding_bucket_not_in_need_discard_btree,          291,    0)              \
-       x(backpointer_bucket_offset_wrong,                      125,    0)              \
-       x(backpointer_level_bad,                                294,    0)              \
-       x(backpointer_dev_bad,                                  297,    0)              \
-       x(backpointer_to_missing_device,                        126,    FSCK_AUTOFIX)   \
-       x(backpointer_to_missing_alloc,                         127,    FSCK_AUTOFIX)   \
-       x(backpointer_to_missing_ptr,                           128,    FSCK_AUTOFIX)   \
-       x(lru_entry_at_time_0,                                  129,    FSCK_AUTOFIX)   \
-       x(lru_entry_to_invalid_bucket,                          130,    FSCK_AUTOFIX)   \
-       x(lru_entry_bad,                                        131,    FSCK_AUTOFIX)   \
-       x(btree_ptr_val_too_big,                                132,    0)              \
-       x(btree_ptr_v2_val_too_big,                             133,    0)              \
-       x(btree_ptr_has_non_ptr,                                134,    0)              \
-       x(extent_ptrs_invalid_entry,                            135,    0)              \
-       x(extent_ptrs_no_ptrs,                                  136,    0)              \
-       x(extent_ptrs_too_many_ptrs,                            137,    0)              \
-       x(extent_ptrs_redundant_crc,                            138,    0)              \
-       x(extent_ptrs_redundant_stripe,                         139,    0)              \
-       x(extent_ptrs_unwritten,                                140,    0)              \
-       x(extent_ptrs_written_and_unwritten,                    141,    0)              \
-       x(ptr_to_invalid_device,                                142,    0)              \
-       x(ptr_to_duplicate_device,                              143,    0)              \
-       x(ptr_after_last_bucket,                                144,    0)              \
-       x(ptr_before_first_bucket,                              145,    0)              \
-       x(ptr_spans_multiple_buckets,                           146,    0)              \
-       x(ptr_to_missing_backpointer,                           147,    FSCK_AUTOFIX)   \
-       x(ptr_to_missing_alloc_key,                             148,    FSCK_AUTOFIX)   \
-       x(ptr_to_missing_replicas_entry,                        149,    FSCK_AUTOFIX)   \
-       x(ptr_to_missing_stripe,                                150,    0)              \
-       x(ptr_to_incorrect_stripe,                              151,    0)              \
-       x(ptr_gen_newer_than_bucket_gen,                        152,    FSCK_AUTOFIX)   \
-       x(ptr_too_stale,                                        153,    0)              \
-       x(stale_dirty_ptr,                                      154,    FSCK_AUTOFIX)   \
-       x(ptr_bucket_data_type_mismatch,                        155,    0)              \
-       x(ptr_cached_and_erasure_coded,                         156,    0)              \
-       x(ptr_crc_uncompressed_size_too_small,                  157,    0)              \
-       x(ptr_crc_uncompressed_size_too_big,                    161,    0)              \
-       x(ptr_crc_uncompressed_size_mismatch,                   300,    0)              \
-       x(ptr_crc_csum_type_unknown,                            158,    0)              \
-       x(ptr_crc_compression_type_unknown,                     159,    0)              \
-       x(ptr_crc_redundant,                                    160,    0)              \
-       x(ptr_crc_nonce_mismatch,                               162,    0)              \
-       x(ptr_stripe_redundant,                                 163,    0)              \
-       x(extent_flags_not_at_start,                            306,    0)              \
-       x(reservation_key_nr_replicas_invalid,                  164,    0)              \
-       x(reflink_v_refcount_wrong,                             165,    FSCK_AUTOFIX)   \
-       x(reflink_v_pos_bad,                                    292,    0)              \
-       x(reflink_p_to_missing_reflink_v,                       166,    FSCK_AUTOFIX)   \
-       x(reflink_refcount_underflow,                           293,    0)              \
-       x(stripe_pos_bad,                                       167,    0)              \
-       x(stripe_val_size_bad,                                  168,    0)              \
-       x(stripe_csum_granularity_bad,                          290,    0)              \
-       x(stripe_sector_count_wrong,                            169,    0)              \
-       x(snapshot_tree_pos_bad,                                170,    0)              \
-       x(snapshot_tree_to_missing_snapshot,                    171,    0)              \
-       x(snapshot_tree_to_missing_subvol,                      172,    0)              \
-       x(snapshot_tree_to_wrong_subvol,                        173,    0)              \
-       x(snapshot_tree_to_snapshot_subvol,                     174,    0)              \
-       x(snapshot_pos_bad,                                     175,    0)              \
-       x(snapshot_parent_bad,                                  176,    0)              \
-       x(snapshot_children_not_normalized,                     177,    0)              \
-       x(snapshot_child_duplicate,                             178,    0)              \
-       x(snapshot_child_bad,                                   179,    0)              \
-       x(snapshot_skiplist_not_normalized,                     180,    0)              \
-       x(snapshot_skiplist_bad,                                181,    0)              \
-       x(snapshot_should_not_have_subvol,                      182,    0)              \
-       x(snapshot_to_bad_snapshot_tree,                        183,    FSCK_AUTOFIX)   \
-       x(snapshot_bad_depth,                                   184,    0)              \
-       x(snapshot_bad_skiplist,                                185,    0)              \
-       x(subvol_pos_bad,                                       186,    0)              \
-       x(subvol_not_master_and_not_snapshot,                   187,    FSCK_AUTOFIX)   \
-       x(subvol_to_missing_root,                               188,    0)              \
-       x(subvol_root_wrong_bi_subvol,                          189,    FSCK_AUTOFIX)   \
-       x(bkey_in_missing_snapshot,                             190,    0)              \
-       x(bkey_in_deleted_snapshot,                             315,    FSCK_AUTOFIX)   \
-       x(inode_pos_inode_nonzero,                              191,    0)              \
-       x(inode_pos_blockdev_range,                             192,    0)              \
-       x(inode_alloc_cursor_inode_bad,                         301,    0)              \
-       x(inode_unpack_error,                                   193,    0)              \
-       x(inode_str_hash_invalid,                               194,    0)              \
-       x(inode_v3_fields_start_bad,                            195,    0)              \
-       x(inode_snapshot_mismatch,                              196,    0)              \
-       x(snapshot_key_missing_inode_snapshot,                  314,    FSCK_AUTOFIX)   \
-       x(inode_unlinked_but_clean,                             197,    0)              \
-       x(inode_unlinked_but_nlink_nonzero,                     198,    0)              \
-       x(inode_unlinked_and_not_open,                          281,    0)              \
-       x(inode_unlinked_but_has_dirent,                        285,    0)              \
-       x(inode_checksum_type_invalid,                          199,    0)              \
-       x(inode_compression_type_invalid,                       200,    0)              \
-       x(inode_subvol_root_but_not_dir,                        201,    0)              \
-       x(inode_i_size_dirty_but_clean,                         202,    FSCK_AUTOFIX)   \
-       x(inode_i_sectors_dirty_but_clean,                      203,    FSCK_AUTOFIX)   \
-       x(inode_i_sectors_wrong,                                204,    FSCK_AUTOFIX)   \
-       x(inode_dir_wrong_nlink,                                205,    FSCK_AUTOFIX)   \
-       x(inode_dir_multiple_links,                             206,    FSCK_AUTOFIX)   \
-       x(inode_dir_missing_backpointer,                        284,    FSCK_AUTOFIX)   \
-       x(inode_dir_unlinked_but_not_empty,                     286,    FSCK_AUTOFIX)   \
-       x(inode_dir_has_nonzero_i_size,                         319,    FSCK_AUTOFIX)   \
-       x(inode_multiple_links_but_nlink_0,                     207,    FSCK_AUTOFIX)   \
-       x(inode_wrong_backpointer,                              208,    FSCK_AUTOFIX)   \
-       x(inode_wrong_nlink,                                    209,    FSCK_AUTOFIX)   \
-       x(inode_has_child_snapshots_wrong,                      287,    FSCK_AUTOFIX)   \
-       x(inode_unreachable,                                    210,    FSCK_AUTOFIX)   \
-       x(inode_journal_seq_in_future,                          299,    FSCK_AUTOFIX)   \
-       x(inode_i_sectors_underflow,                            312,    FSCK_AUTOFIX)   \
-       x(inode_has_case_insensitive_not_set,                   316,    FSCK_AUTOFIX)   \
-       x(inode_parent_has_case_insensitive_not_set,            317,    FSCK_AUTOFIX)   \
-       x(vfs_inode_i_blocks_underflow,                         311,    FSCK_AUTOFIX)   \
-       x(vfs_inode_i_blocks_not_zero_at_truncate,              313,    FSCK_AUTOFIX)   \
-       x(vfs_bad_inode_rm,                                     320,    0)              \
-       x(deleted_inode_but_clean,                              211,    FSCK_AUTOFIX)   \
-       x(deleted_inode_missing,                                212,    FSCK_AUTOFIX)   \
-       x(deleted_inode_is_dir,                                 213,    FSCK_AUTOFIX)   \
-       x(deleted_inode_not_unlinked,                           214,    FSCK_AUTOFIX)   \
-       x(deleted_inode_has_child_snapshots,                    288,    FSCK_AUTOFIX)   \
-       x(extent_overlapping,                                   215,    0)              \
-       x(key_in_missing_inode,                                 216,    FSCK_AUTOFIX)   \
-       x(key_in_wrong_inode_type,                              217,    0)              \
-       x(extent_past_end_of_inode,                             218,    FSCK_AUTOFIX)   \
-       x(dirent_empty_name,                                    219,    0)              \
-       x(dirent_val_too_big,                                   220,    0)              \
-       x(dirent_name_too_long,                                 221,    0)              \
-       x(dirent_name_embedded_nul,                             222,    0)              \
-       x(dirent_name_dot_or_dotdot,                            223,    0)              \
-       x(dirent_name_has_slash,                                224,    0)              \
-       x(dirent_d_type_wrong,                                  225,    FSCK_AUTOFIX)   \
-       x(inode_bi_parent_wrong,                                226,    0)              \
-       x(dirent_in_missing_dir_inode,                          227,    0)              \
-       x(dirent_in_non_dir_inode,                              228,    0)              \
-       x(dirent_to_missing_inode,                              229,    FSCK_AUTOFIX)   \
-       x(dirent_to_overwritten_inode,                          302,    0)              \
-       x(dirent_to_missing_subvol,                             230,    0)              \
-       x(dirent_to_itself,                                     231,    0)              \
-       x(dirent_casefold_mismatch,                             318,    FSCK_AUTOFIX)   \
-       x(quota_type_invalid,                                   232,    0)              \
-       x(xattr_val_size_too_small,                             233,    0)              \
-       x(xattr_val_size_too_big,                               234,    0)              \
-       x(xattr_invalid_type,                                   235,    0)              \
-       x(xattr_name_invalid_chars,                             236,    0)              \
-       x(xattr_in_missing_inode,                               237,    0)              \
-       x(root_subvol_missing,                                  238,    0)              \
-       x(root_dir_missing,                                     239,    0)              \
-       x(root_inode_not_dir,                                   240,    0)              \
-       x(dir_loop,                                             241,    0)              \
-       x(hash_table_key_duplicate,                             242,    FSCK_AUTOFIX)   \
-       x(hash_table_key_wrong_offset,                          243,    FSCK_AUTOFIX)   \
-       x(unlinked_inode_not_on_deleted_list,                   244,    FSCK_AUTOFIX)   \
-       x(reflink_p_front_pad_bad,                              245,    0)              \
-       x(journal_entry_dup_same_device,                        246,    0)              \
-       x(inode_bi_subvol_missing,                              247,    0)              \
-       x(inode_bi_subvol_wrong,                                248,    0)              \
-       x(inode_points_to_missing_dirent,                       249,    FSCK_AUTOFIX)   \
-       x(inode_points_to_wrong_dirent,                         250,    FSCK_AUTOFIX)   \
-       x(inode_bi_parent_nonzero,                              251,    0)              \
-       x(dirent_to_missing_parent_subvol,                      252,    0)              \
-       x(dirent_not_visible_in_parent_subvol,                  253,    0)              \
-       x(subvol_fs_path_parent_wrong,                          254,    0)              \
-       x(subvol_root_fs_path_parent_nonzero,                   255,    0)              \
-       x(subvol_children_not_set,                              256,    0)              \
-       x(subvol_children_bad,                                  257,    0)              \
-       x(subvol_loop,                                          258,    0)              \
-       x(subvol_unreachable,                                   259,    FSCK_AUTOFIX)   \
-       x(btree_node_bkey_bad_u64s,                             260,    0)              \
-       x(btree_node_topology_empty_interior_node,              261,    0)              \
-       x(btree_ptr_v2_min_key_bad,                             262,    0)              \
-       x(btree_root_unreadable_and_scan_found_nothing,         263,    0)              \
-       x(snapshot_node_missing,                                264,    FSCK_AUTOFIX)   \
-       x(dup_backpointer_to_bad_csum_extent,                   265,    0)              \
-       x(btree_bitmap_not_marked,                              266,    FSCK_AUTOFIX)   \
-       x(sb_clean_entry_overrun,                               267,    0)              \
-       x(btree_ptr_v2_written_0,                               268,    0)              \
-       x(subvol_snapshot_bad,                                  269,    0)              \
-       x(subvol_inode_bad,                                     270,    0)              \
-       x(subvol_missing,                                       308,    FSCK_AUTOFIX)   \
-       x(alloc_key_stripe_sectors_wrong,                       271,    FSCK_AUTOFIX)   \
-       x(accounting_mismatch,                                  272,    FSCK_AUTOFIX)   \
-       x(accounting_replicas_not_marked,                       273,    0)              \
-       x(accounting_to_invalid_device,                         289,    0)              \
-       x(invalid_btree_id,                                     274,    FSCK_AUTOFIX)   \
-       x(alloc_key_io_time_bad,                                275,    0)              \
-       x(alloc_key_fragmentation_lru_wrong,                    276,    FSCK_AUTOFIX)   \
-       x(accounting_key_junk_at_end,                           277,    FSCK_AUTOFIX)   \
-       x(accounting_key_replicas_nr_devs_0,                    278,    FSCK_AUTOFIX)   \
-       x(accounting_key_replicas_nr_required_bad,              279,    FSCK_AUTOFIX)   \
-       x(accounting_key_replicas_devs_unsorted,                280,    FSCK_AUTOFIX)   \
-       x(accounting_key_version_0,                             282,    FSCK_AUTOFIX)   \
-       x(accounting_key_nr_counters_wrong,                     307,    FSCK_AUTOFIX)   \
-       x(logged_op_but_clean,                                  283,    FSCK_AUTOFIX)   \
-       x(compression_opt_not_marked_in_sb,                     295,    FSCK_AUTOFIX)   \
-       x(compression_type_not_marked_in_sb,                    296,    FSCK_AUTOFIX)   \
-       x(directory_size_mismatch,                              303,    FSCK_AUTOFIX)   \
-       x(dirent_cf_name_too_big,                               304,    0)              \
-       x(dirent_stray_data_after_cf_name,                      305,    0)              \
-       x(rebalance_work_incorrectly_set,                       309,    FSCK_AUTOFIX)   \
-       x(rebalance_work_incorrectly_unset,                     310,    FSCK_AUTOFIX)   \
-       x(MAX,                                                  321,    0)
-
-enum bch_sb_error_id {
-#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
-       BCH_SB_ERRS()
-#undef x
-};
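The enum above is built from the BCH_SB_ERRS() x-macro table: each
x(name, nr, flags) entry becomes one enumerator whose value is a stable
on-disk ID, which is why later additions (298, 306, 315, ...) sit out of
numeric order in the table. A minimal sketch of the expansion, using
entries visible above:

	/* with #define x(t, n, ...) BCH_FSCK_ERR_##t = n, the table expands to: */
	enum bch_sb_error_id {
		BCH_FSCK_ERR_jset_seq_blacklisted	= 10,
		BCH_FSCK_ERR_journal_entries_missing	= 11,
		/* ... one enumerator per x() entry ... */
		BCH_FSCK_ERR_MAX			= 321,
	};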
-
-struct bch_sb_field_errors {
-       struct bch_sb_field     field;
-       struct bch_sb_field_error_entry {
-               __le64          v;
-               __le64          last_error_time;
-       }                       entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,    struct bch_sb_field_error_entry, v,  0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,    struct bch_sb_field_error_entry, v, 16, 64);
-
-#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
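LE64_BITMASK() generates little-endian getter/setter pairs for a bit range
of a struct field; per the two lines above, the error ID lives in bits 0-15
of v and the occurrence count in bits 16-63. Roughly what the generated
getters do (a sketch of the macro's effect, not its literal output):

	static inline u64 BCH_SB_ERROR_ENTRY_ID(const struct bch_sb_field_error_entry *e)
	{
		return le64_to_cpu(e->v) & ((1ULL << 16) - 1);	/* bits 0..15 */
	}

	static inline u64 BCH_SB_ERROR_ENTRY_NR(const struct bch_sb_field_error_entry *e)
	{
		return le64_to_cpu(e->v) >> 16;			/* bits 16..63 */
	}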
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
deleted file mode 100644
index 4032523..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
-#define _BCACHEFS_SB_ERRORS_TYPES_H
-
-#include "darray.h"
-
-struct bch_sb_error_entry_cpu {
-       u64                     id:16,
-                               nr:48;
-       u64                     last_error_time;
-};
-
-typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
-
-#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
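The in-memory entry mirrors the on-disk packing with a C bitfield (16-bit
id, 48-bit count), and DARRAY() wraps it in a resizable array type. A
hypothetical conversion from the on-disk entry, assuming the getter
sketches above (the real conversion helpers live elsewhere and are not
shown here):

	static inline struct bch_sb_error_entry_cpu
	sb_error_entry_to_cpu(const struct bch_sb_field_error_entry *e)
	{
		return (struct bch_sb_error_entry_cpu) {
			.id		 = BCH_SB_ERROR_ENTRY_ID(e),
			.nr		 = BCH_SB_ERROR_ENTRY_NR(e),
			.last_error_time = le64_to_cpu(e->last_error_time),
		};
	}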
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
deleted file mode 100644
index 6245e34..0000000
+++ /dev/null
@@ -1,606 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "opts.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
-{
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
-       bch2_bkey_val_to_text(&buf, c, k);
-
-       bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
-
-       int ret = bch2_run_explicit_recovery_pass(c, &buf,
-                                       BCH_RECOVERY_PASS_check_allocations, 0);
-
-       if (print)
-               bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
-{
-       if (dev != BCH_SB_MEMBER_INVALID)
-               bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
-{
-       bch2_fs_inconsistent(ca->fs,
-               "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
-               bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
-}
-
-#define x(t, n, ...) [n] = #t,
-static const char * const bch2_iops_measurements[] = {
-       BCH_IOPS_MEASUREMENTS()
-       NULL
-};
-
-char * const bch2_member_error_strs[] = {
-       BCH_MEMBER_ERROR_TYPES()
-       NULL
-};
-#undef x
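This reuses the x-macro lists a second way: defining x(t, n, ...) as
[n] = #t, turns each entry into a designated initializer, producing an
index-to-name string table that matches the enum numbering. Illustratively
(BCH_MEMBER_ERROR_TYPES() is defined in a header not shown here, so the
entry names below are assumed):

	/* if BCH_MEMBER_ERROR_TYPES() were x(read, 0) x(write, 1) x(checksum, 2): */
	char * const bch2_member_error_strs[] = {
		[0] = "read",
		[1] = "write",
		[2] = "checksum",
		NULL
	};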
-
-/* Code for bch_sb_field_members_v1: */
-
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
-{
-       return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
-}
-
-static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
-{
-       struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
-       memset(&ret, 0, sizeof(ret));
-       memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
-       return ret;
-}
-
-static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
-{
-       return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
-}
-
-static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
-{
-       struct bch_member ret, *p = members_v1_get_mut(mi, i);
-       memset(&ret, 0, sizeof(ret));
-       memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
-       return ret;
-}
-
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
-{
-       struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
-       if (mi2)
-               return members_v2_get(mi2, i);
-       struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
-       return members_v1_get(mi1, i);
-}
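Both getters read a member entry defensively: zero the destination, then
copy at most the smaller of the on-disk entry size and the current
struct bch_member, so entries written by older versions come back
zero-extended. The shape of the pattern, with illustrative names:

	/* read an entry whose on-disk size may be smaller than the current struct */
	static struct bch_member member_read_compat(const void *disk_entry,
						    size_t disk_entry_bytes)
	{
		struct bch_member m = {};

		memcpy(&m, disk_entry, min_t(size_t, disk_entry_bytes, sizeof(m)));
		return m;
	}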
-
-static int sb_members_v2_resize_entries(struct bch_fs *c)
-{
-       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
-       if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
-               unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
-                                             c->disk_sb.sb->nr_devices), 8);
-
-               mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
-               if (!mi)
-                       return bch_err_throw(c, ENOSPC_sb_members_v2);
-
-               for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
-                       void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
-                       memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
-                       memset(dst + le16_to_cpu(mi->member_bytes),
-                              0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
-               }
-               mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
-       }
-       return 0;
-}
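The widening loop walks from the highest index down for a reason: entry
i's new offset (i * sizeof(struct bch_member)) is at or beyond its old
offset (i * member_bytes), so relocating high entries first guarantees no
memmove() overwrites bytes that still need to be moved. A toy model of the
same in-place widening, outside the superblock machinery:

	#include <string.h>	/* memmove, memset */

	static void widen_entries(unsigned char *base, unsigned nr,
				  size_t old_sz, size_t new_sz)
	{
		for (int i = nr - 1; i >= 0; --i) {
			memmove(base + i * new_sz, base + i * old_sz, old_sz);
			memset(base + i * new_sz + old_sz, 0, new_sz - old_sz);
		}
	}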
-
-int bch2_sb_members_v2_init(struct bch_fs *c)
-{
-       struct bch_sb_field_members_v1 *mi1;
-       struct bch_sb_field_members_v2 *mi2;
-
-       if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
-               mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
-                               DIV_ROUND_UP(sizeof(*mi2) +
-                                            sizeof(struct bch_member) * c->sb.nr_devices,
-                                            sizeof(u64)));
-               mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
-               memcpy(&mi2->_members[0], &mi1->_members[0],
-                      BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
-               memset(&mi2->pad[0], 0, sizeof(mi2->pad));
-               mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
-       }
-
-       return sb_members_v2_resize_entries(c);
-}
-
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
-{
-       struct bch_sb_field_members_v1 *mi1;
-       struct bch_sb_field_members_v2 *mi2;
-
-       if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
-               bch2_sb_field_resize(disk_sb, members_v1, 0);
-               return 0;
-       }
-
-       mi1 = bch2_sb_field_resize(disk_sb, members_v1,
-                       DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
-                                    disk_sb->sb->nr_devices, sizeof(u64)));
-       if (!mi1)
-               return -BCH_ERR_ENOSPC_sb_members;
-
-       mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
-
-       for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
-               memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
-
-       return 0;
-}
-
-static int validate_member(struct printbuf *err,
-                          struct bch_member m,
-                          struct bch_sb *sb,
-                          int i)
-{
-       if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
-               prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
-                          i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       if (le64_to_cpu(m.nbuckets) -
-           le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
-               prt_printf(err, "device %u: not enough buckets (got %llu, min %u)",
-                          i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       if (le16_to_cpu(m.bucket_size) <
-           le16_to_cpu(sb->block_size)) {
-               prt_printf(err, "device %u: bucket size %u smaller than block size %u",
-                          i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       if (le16_to_cpu(m.bucket_size) <
-           BCH_SB_BTREE_NODE_SIZE(sb)) {
-               prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
-                          i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
-               prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
-           sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
-               prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       return 0;
-}
-
-static void member_to_text(struct printbuf *out,
-                          struct bch_member m,
-                          struct bch_sb_field_disk_groups *gi,
-                          struct bch_sb *sb,
-                          int i)
-{
-       unsigned data_have = bch2_sb_dev_has_data(sb, i);
-       u64 bucket_size = le16_to_cpu(m.bucket_size);
-       u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
-
-       if (!bch2_member_alive(&m))
-               return;
-
-       prt_printf(out, "Device:\t%u\n", i);
-
-       printbuf_indent_add(out, 2);
-
-       prt_printf(out, "Label:\t");
-       if (BCH_MEMBER_GROUP(&m))
-               bch2_disk_path_to_text_sb(out, sb,
-                               BCH_MEMBER_GROUP(&m) - 1);
-       else
-               prt_printf(out, "(none)");
-       prt_newline(out);
-
-       prt_printf(out, "UUID:\t");
-       pr_uuid(out, m.uuid.b);
-       prt_newline(out);
-
-       prt_printf(out, "Size:\t");
-       prt_units_u64(out, device_size << 9);
-       prt_newline(out);
-
-       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
-               prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
-
-       for (unsigned i = 0; i < BCH_IOPS_NR; i++)
-               prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
-
-       prt_printf(out, "Bucket size:\t");
-       prt_units_u64(out, bucket_size << 9);
-       prt_newline(out);
-
-       prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
-       prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
-
-       prt_printf(out, "Last mount:\t");
-       if (m.last_mount)
-               bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
-       else
-               prt_printf(out, "(never)");
-       prt_newline(out);
-
-       prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
-
-       prt_printf(out, "State:\t%s\n",
-                  BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
-                  ? bch2_member_states[BCH_MEMBER_STATE(&m)]
-                  : "unknown");
-
-       prt_printf(out, "Data allowed:\t");
-       if (BCH_MEMBER_DATA_ALLOWED(&m))
-               prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
-       else
-               prt_printf(out, "(none)");
-       prt_newline(out);
-
-       prt_printf(out, "Has data:\t");
-       if (data_have)
-               prt_bitflags(out, __bch2_data_types, data_have);
-       else
-               prt_printf(out, "(none)");
-       prt_newline(out);
-
-       prt_printf(out, "Btree allocated bitmap blocksize:\t");
-       if (m.btree_bitmap_shift < 64)
-               prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
-       else
-               prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
-       prt_newline(out);
-
-       prt_printf(out, "Btree allocated bitmap:\t");
-       bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
-       prt_newline(out);
-
-       prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
-
-       prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
-       prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
-       prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
-
-       printbuf_indent_sub(out, 2);
-}
-
-static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
-       unsigned i;
-
-       if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
-               prt_printf(err, "too many devices for section size");
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member m = members_v1_get(mi, i);
-
-               int ret = validate_member(err, m, sb, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
-                                      struct bch_sb_field *f)
-{
-       struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
-       struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
-       if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
-               prt_printf(out, "field ends before start of entries");
-               return;
-       }
-
-       unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
-       if (nr != sb->nr_devices)
-               prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
-       for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
-               member_to_text(out, members_v1_get(mi, i), gi, sb, i);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
-       .validate       = bch2_sb_members_v1_validate,
-       .to_text        = bch2_sb_members_v1_to_text,
-};
-
-static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
-                                      struct bch_sb_field *f)
-{
-       struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
-       struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
-       if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
-               prt_printf(out, "field ends before start of entries");
-               return;
-       }
-
-       if (!le16_to_cpu(mi->member_bytes)) {
-               prt_printf(out, "member_bytes 0");
-               return;
-       }
-
-       unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
-       if (nr != sb->nr_devices)
-               prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
-       /*
-        * We call to_text() on superblock sections that haven't passed
-        * validate, so we can't trust sb->nr_devices.
-        */
-
-       for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
-               member_to_text(out, members_v2_get(mi, i), gi, sb, i);
-}
-
-static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
-       size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
-               (void *) mi;
-
-       if (mi_bytes > vstruct_bytes(&mi->field)) {
-               prt_printf(err, "section too small (%zu > %zu)",
-                          mi_bytes, vstruct_bytes(&mi->field));
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       for (unsigned i = 0; i < sb->nr_devices; i++) {
-               int ret = validate_member(err, members_v2_get(mi, i), sb, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
-       .validate       = bch2_sb_members_v2_validate,
-       .to_text        = bch2_sb_members_v2_to_text,
-};
-
-void bch2_sb_members_from_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
-       guard(rcu)();
-       for_each_member_device_rcu(c, ca, NULL) {
-               struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
-
-               for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
-                       m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
-       }
-}
-
-void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-       struct bch_member m;
-
-       mutex_lock(&ca->fs->sb_lock);
-       m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
-       mutex_unlock(&ca->fs->sb_lock);
-
-       printbuf_tabstop_push(out, 12);
-
-       prt_str(out, "IO errors since filesystem creation");
-       prt_newline(out);
-
-       printbuf_indent_add(out, 2);
-       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
-               prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
-       printbuf_indent_sub(out, 2);
-
-       prt_str(out, "IO errors since ");
-       bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
-       prt_str(out, " ago");
-       prt_newline(out);
-
-       printbuf_indent_add(out, 2);
-       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
-               prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
-                          atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
-       printbuf_indent_sub(out, 2);
-}
-
-void bch2_dev_errors_reset(struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-       struct bch_member *m;
-
-       mutex_lock(&c->sb_lock);
-       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-       for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
-               m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
-       m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-}
-
-/*
- * Per member "range has btree nodes" bitmap:
- *
- * This is so that if we ever have to run the btree node scan to repair we don't
- * have to scan full devices:
- */
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
-{
-       guard(rcu)();
-       bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
-               struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-               if (ca &&
-                   !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
-                       return false;
-       }
-       return true;
-}
-
-static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
-                               u64 start, unsigned sectors)
-{
-       struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
-       u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
-
-       u64 end = start + sectors;
-
-       int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
-       if (resize > 0) {
-               u64 new_bitmap = 0;
-
-               for (unsigned i = 0; i < 64; i++)
-                       if (bitmap & BIT_ULL(i))
-                               new_bitmap |= BIT_ULL(i >> resize);
-               bitmap = new_bitmap;
-               m->btree_bitmap_shift += resize;
-       }
-
-       BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
-       BUG_ON(end > 64ULL << m->btree_bitmap_shift);
-
-       for (unsigned bit = start >> m->btree_bitmap_shift;
-            (u64) bit << m->btree_bitmap_shift < end;
-            bit++)
-               bitmap |= BIT_ULL(bit);
-
-       m->btree_allocated_bitmap = cpu_to_le64(bitmap);
-}
-
-void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
-{
-       lockdep_assert_held(&c->sb_lock);
-
-       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-       bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
-               if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
-                       continue;
-
-               __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
-       }
-}
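Each of the 64 bitmap bits covers 2^btree_bitmap_shift sectors, so the
bitmap spans 64 << shift sectors in total. When a newly marked range ends
past that span, the shift is raised and existing bits are folded down
(bit i becomes bit i >> resize), trading granularity for coverage. A worked
example with made-up numbers:

	/* shift = 10: each bit covers 1024 sectors, span = 64 << 10 = 65536.
	 * Marking [130000, 130512): roundup_pow_of_two(130512) = 2^17, so
	 * resize = 17 - (10 + 6) = 1; shift becomes 11, old bit i folds into
	 * bit i >> 1, and the final loop sets bit 130000 >> 11 = 63. */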
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
-{
-       unsigned nr = 0;
-
-       for (unsigned i = 0; i < sb->nr_devices; i++)
-               nr += bch2_member_exists((struct bch_sb *) sb, i);
-       return nr;
-}
-
-int bch2_sb_member_alloc(struct bch_fs *c)
-{
-       unsigned dev_idx = c->sb.nr_devices;
-       struct bch_sb_field_members_v2 *mi;
-       unsigned nr_devices;
-       unsigned u64s;
-       int best = -1;
-       u64 best_last_mount = 0;
-       unsigned nr_deleted = 0;
-
-       if (dev_idx < BCH_SB_MEMBERS_MAX)
-               goto have_slot;
-
-       for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
-               /* eventually BCH_SB_MEMBERS_MAX will be raised */
-               if (dev_idx == BCH_SB_MEMBER_INVALID)
-                       continue;
-
-               struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
-
-               nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
-
-               if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
-                       continue;
-
-               u64 last_mount = le64_to_cpu(m.last_mount);
-               if (best < 0 || last_mount < best_last_mount) {
-                       best = dev_idx;
-                       best_last_mount = last_mount;
-               }
-       }
-       if (best >= 0) {
-               dev_idx = best;
-               goto have_slot;
-       }
-
-       if (nr_deleted)
-               bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
-                       nr_deleted);
-
-       return -BCH_ERR_ENOSPC_sb_members;
-have_slot:
-       nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
-       mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-       u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
-                           le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
-
-       mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
-       if (!mi)
-               return -BCH_ERR_ENOSPC_sb_members;
-
-       c->disk_sb.sb->nr_devices = nr_devices;
-       return dev_idx;
-}
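Allocation prefers a never-used index while nr_devices is below
BCH_SB_MEMBERS_MAX; otherwise it recycles the empty (zero-UUID) slot with
the oldest last_mount, so the most recently vacated index is reused last.
A sketch of a caller, assuming the usual sb_lock discipline (the real
device-add path lives elsewhere and is not shown here):

	mutex_lock(&c->sb_lock);

	int dev_idx = bch2_sb_member_alloc(c);
	if (dev_idx >= 0) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

		/* fill in *m (uuid, nbuckets, ...), then persist: */
		bch2_write_super(c);
	}

	mutex_unlock(&c->sb_lock);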
-
-void bch2_sb_members_clean_deleted(struct bch_fs *c)
-{
-       mutex_lock(&c->sb_lock);
-       bool write_sb = false;
-
-       for (unsigned i = 0; i < c->sb.nr_devices; i++) {
-               struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
-
-               if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
-                       memset(&m->uuid, 0, sizeof(m->uuid));
-                       write_sb = true;
-               }
-       }
-
-       if (write_sb)
-               bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
deleted file mode 100644
index 8d8a8a8..0000000
+++ /dev/null
@@ -1,377 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_H
-#define _BCACHEFS_SB_MEMBERS_H
-
-#include "darray.h"
-#include "bkey_types.h"
-#include "enumerated_ref.h"
-
-extern char * const bch2_member_error_strs[];
-
-static inline struct bch_member *
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
-{
-       return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c);
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
-       return !enumerated_ref_is_zero(&ca->io_ref[READ]);
-}
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
-
-static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
-{
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu(c, dev);
-       return ca && bch2_dev_is_online(ca);
-}
-
-static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
-{
-       return bch2_dev_is_online(ca) &&
-               ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
-       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
-                                        unsigned dev)
-{
-       darray_for_each(devs, i)
-               if (*i == dev)
-                       return true;
-       return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
-                                         unsigned dev)
-{
-       darray_for_each(*devs, i)
-               if (*i == dev) {
-                       darray_remove_item(devs, i);
-                       return;
-               }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
-                                        unsigned dev)
-{
-       if (!bch2_dev_list_has_dev(*devs, dev)) {
-               BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
-               devs->data[devs->nr++] = dev;
-       }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
-       return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
-                                                 const struct bch_devs_mask *mask)
-{
-       struct bch_dev *ca = NULL;
-
-       while ((idx = mask
-               ? find_next_bit(mask->d, c->sb.nr_devices, idx)
-               : idx) < c->sb.nr_devices &&
-              !(ca = rcu_dereference_check(c->devs[idx],
-                                           lockdep_is_held(&c->state_lock))))
-               idx++;
-
-       return ca;
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
-                                             const struct bch_devs_mask *mask)
-{
-       return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
-}
-
-#define for_each_member_device_rcu(_c, _ca, _mask)                     \
-       for (struct bch_dev *_ca = NULL;                                \
-            (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-
-#define for_each_online_member_rcu(_c, _ca)                            \
-       for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
-
-#define for_each_rw_member_rcu(_c, _ca)                                        \
-       for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
-
-static inline void bch2_dev_get(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
-#else
-       percpu_ref_get(&ca->ref);
-#endif
-}
-
-static inline void __bch2_dev_put(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       long r = atomic_long_dec_return(&ca->ref);
-       if (r < (long) !ca->dying)
-               panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
-       ca->last_put = _THIS_IP_;
-       if (!r)
-               complete(&ca->ref_completion);
-#else
-       percpu_ref_put(&ca->ref);
-#endif
-}
-
-static inline void bch2_dev_put(struct bch_dev *ca)
-{
-       if (ca)
-               __bch2_dev_put(ca);
-}
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
-{
-       guard(rcu)();
-       bch2_dev_put(ca);
-       if ((ca = __bch2_next_dev(c, ca, NULL)))
-               bch2_dev_get(ca);
-       return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define __for_each_member_device(_c, _ca)                              \
-       for (;  (_ca = bch2_get_next_dev(_c, _ca));)
-
-#define for_each_member_device(_c, _ca)                                        \
-       for (struct bch_dev *_ca = NULL;                                \
-            (_ca = bch2_get_next_dev(_c, _ca));)
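These iterators hand back each device with a reference held and drop it
when advancing, which makes the comment above load-bearing: leaving the
loop early keeps the last reference. A usage sketch, with an assumed
predicate:

	struct bch_dev *found = NULL;

	for_each_member_device(c, ca)
		if (device_matches(ca)) {	/* illustrative predicate */
			found = ca;		/* keep the ref the iterator took */
			break;			/* early exit: ref is NOT dropped */
		}

	if (found) {
		/* ... use found ... */
		bch2_dev_put(found);
	}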
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
-                                                      struct bch_dev *ca,
-                                                      unsigned state_mask,
-                                                      int rw, unsigned ref_idx)
-{
-       guard(rcu)();
-       if (ca)
-               enumerated_ref_put(&ca->io_ref[rw], ref_idx);
-
-       while ((ca = __bch2_next_dev(c, ca, NULL)) &&
-              (!((1 << ca->mi.state) & state_mask) ||
-               !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
-               ;
-
-       return ca;
-}
-
-#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx)     \
-       for (struct bch_dev *_ca = NULL;                                \
-            (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
-
-#define for_each_online_member(c, ca, ref_idx)                         \
-       __for_each_online_member(c, ca, ~0, READ, ref_idx)
-
-#define for_each_rw_member(c, ca, ref_idx)                                     \
-       __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
-
-#define for_each_readable_member(c, ca, ref_idx)                               \
-       __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
-
-static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
-{
-       return dev < c->sb.nr_devices && c->devs[dev];
-}
-
-static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
-{
-       return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
-}
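bucket_valid() folds both bounds into a single comparison: the operands
are unsigned, so b < first_bucket wraps b - first_bucket around to a huge
value that fails the test, and nbuckets_minus_first is precomputed in
bch2_mi_to_cpu() below. Spelled out, it is equivalent to:

	static inline bool bucket_valid_spelled_out(const struct bch_dev *ca, u64 b)
	{
		return b >= ca->mi.first_bucket && b < ca->mi.nbuckets;
	}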
-
-static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
-{
-       EBUG_ON(!bch2_dev_exists(c, dev));
-
-       return rcu_dereference_check(c->devs[dev], 1);
-}
-
-static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
-{
-       EBUG_ON(!bch2_dev_exists(c, dev));
-
-       return rcu_dereference_protected(c->devs[dev],
-                                        lockdep_is_held(&c->sb_lock) ||
-                                        lockdep_is_held(&c->state_lock));
-}
-
-static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
-{
-       return c && dev < c->sb.nr_devices
-               ? rcu_dereference(c->devs[dev])
-               : NULL;
-}
-
-int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
-{
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
-       if (unlikely(!ca))
-               bch2_dev_missing_atomic(c, dev);
-       return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
-{
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
-       if (ca)
-               bch2_dev_get(ca);
-       return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
-{
-       struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
-       if (unlikely(!ca))
-               bch2_dev_missing_atomic(c, dev);
-       return ca;
-}
-
-static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
-{
-       struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
-       if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
-               bch2_dev_put(ca);
-               ca = NULL;
-       }
-       return ca;
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *, u64);
-
-static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
-{
-       struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
-       if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
-               bch2_dev_bucket_missing(ca, bucket.offset);
-               bch2_dev_put(ca);
-               ca = NULL;
-       }
-       return ca;
-}
-
-static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
-       if (ca && ca->dev_idx == dev_idx)
-               return ca;
-       bch2_dev_put(ca);
-       return bch2_dev_tryget_noerror(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
-       if (ca && ca->dev_idx == dev_idx)
-               return ca;
-       bch2_dev_put(ca);
-       return bch2_dev_tryget(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
-                                                int rw, unsigned ref_idx)
-{
-       might_sleep();
-
-       guard(rcu)();
-       struct bch_dev *ca = bch2_dev_rcu(c, dev);
-       if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
-               return NULL;
-
-       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
-           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
-               return ca;
-
-       enumerated_ref_put(&ca->io_ref[rw], ref_idx);
-       return NULL;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-
-static inline bool bch2_member_alive(struct bch_member *m)
-{
-       return  !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
-               !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
-}
-
-static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
-{
-       if (dev < sb->nr_devices) {
-               struct bch_member m = bch2_sb_member_get(sb, dev);
-               return bch2_member_alive(&m);
-       }
-       return false;
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *);
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
-       return (struct bch_member_cpu) {
-               .nbuckets       = le64_to_cpu(mi->nbuckets),
-               .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
-                       le16_to_cpu(mi->first_bucket),
-               .first_bucket   = le16_to_cpu(mi->first_bucket),
-               .bucket_size    = le16_to_cpu(mi->bucket_size),
-               .group          = BCH_MEMBER_GROUP(mi),
-               .state          = BCH_MEMBER_STATE(mi),
-               .discard        = BCH_MEMBER_DISCARD(mi),
-               .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
-               .durability     = BCH_MEMBER_DURABILITY(mi)
-                       ? BCH_MEMBER_DURABILITY(mi) - 1
-                       : 1,
-               .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
-               .resize_on_mount        = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
-               .valid          = bch2_member_alive(mi),
-               .btree_bitmap_shift     = mi->btree_bitmap_shift,
-               .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
-       };
-}
-
-void bch2_sb_members_from_cpu(struct bch_fs *);
-
-void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
-void bch2_dev_errors_reset(struct bch_dev *);
-
-static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
-{
-       u64 end = start + sectors;
-
-       if (end > 64ULL << ca->mi.btree_bitmap_shift)
-               return false;
-
-       for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
-            (u64) bit << ca->mi.btree_bitmap_shift < end;
-            bit++)
-               if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
-                       return false;
-       return true;
-}
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
-void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
-
-int bch2_sb_member_alloc(struct bch_fs *);
-void bch2_sb_members_clean_deleted(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_MEMBERS_H */
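The lookup helpers in this header all follow the same try-get/put discipline: the _noerror variants return NULL silently, while the plain variants log the missing device. A minimal caller sketch (the function name and the -ENODEV choice are illustrative, not from the deleted file):

	static int example_query_dev(struct bch_fs *c, unsigned dev_idx)
	{
		struct bch_dev *ca = bch2_dev_tryget(c, dev_idx);
		if (!ca)
			return -ENODEV;

		/* ca may be dereferenced safely while the ref is held */
		pr_info("device %u: %llu buckets\n", dev_idx, ca->mi.nbuckets);

		bch2_dev_put(ca);
		return 0;
	}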
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
deleted file mode 100644 (file)
index fb72ad7..0000000
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
-#define _BCACHEFS_SB_MEMBERS_FORMAT_H
-
-/*
- * We refer to members with bitmasks in various places - but we need to get rid
- * of this limit:
- */
-#define BCH_SB_MEMBERS_MAX             64
-
-/*
- * Sentinel value - indicates a device that does not exist
- */
-#define BCH_SB_MEMBER_INVALID          255
-
-#define BCH_SB_MEMBER_DELETED_UUID                                     \
-       UUID_INIT(0xffffffff, 0xffff, 0xffff,                           \
-                 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCH_MIN_NR_NBUCKETS    (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS()                        \
-       x(seqread,      0)                      \
-       x(seqwrite,     1)                      \
-       x(randread,     2)                      \
-       x(randwrite,    3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
-       BCH_IOPS_MEASUREMENTS()
-#undef x
-       BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES()               \
-       x(read,         0)                      \
-       x(write,        1)                      \
-       x(checksum,     2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
-       BCH_MEMBER_ERROR_TYPES()
-#undef x
-       BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
-       __uuid_t                uuid;
-       __le64                  nbuckets;       /* device size */
-       __le16                  first_bucket;   /* index of first bucket used */
-       __le16                  bucket_size;    /* sectors */
-       __u8                    btree_bitmap_shift;
-       __u8                    pad[3];
-       __le64                  last_mount;     /* time_t */
-
-       __le64                  flags;
-       __le32                  iops[4];
-       __le64                  errors[BCH_MEMBER_ERROR_NR];
-       __le64                  errors_at_reset[BCH_MEMBER_ERROR_NR];
-       __le64                  errors_reset_time;
-       __le64                  seq;
-       __le64                  btree_allocated_bitmap;
-       /*
-        * On recovery from a clean shutdown we don't normally read the journal,
-        * but we still want to resume writing from where we left off so we
-        * don't overwrite more than is necessary, for list journal debugging:
-        */
-       __le32                  last_journal_bucket;
-       __le32                  last_journal_bucket_offset;
-};
-
-/*
- * btree_allocated_bitmap must be able to represent sector addresses of a full
- * u64: it has 64 bits, so the maximum shift is 64 - ilog2(64) = 58
- */
-#define BCH_MI_BTREE_BITMAP_SHIFT_MAX  58
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocation are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX        (INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES    56
-
-LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE,   struct bch_member, bucket_size,  0, 16)
-LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags,  0,  4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
-                                       struct bch_member, flags, 30, 31)
-LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
-                                       struct bch_member, flags, 31, 32)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES()                    \
-       x(rw,           0)                      \
-       x(ro,           1)                      \
-       x(failed,       2)                      \
-       x(spare,        3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
-       BCH_MEMBER_STATES()
-#undef x
-       BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
-       struct bch_sb_field     field;
-       struct bch_member       _members[]; // Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
-       struct bch_sb_field     field;
-       __le16                  member_bytes; // size of a single member entry
-       u8                      pad[6];
-       struct bch_member       _members[];
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
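BCH_IOPS_MEASUREMENTS(), BCH_MEMBER_ERROR_TYPES() and BCH_MEMBER_STATES() above are x-macro lists: each use site defines x() to expand the list into a different construct, keeping the enum values and any derived tables in sync from a single definition. A sketch of the pattern (this particular string table is illustrative, not part of the deleted file):

	static const char * const member_state_names[] = {
	#define x(t, n) [n] = #t,
		BCH_MEMBER_STATES()
	#undef x
	};

	/* member_state_names[BCH_MEMBER_STATE_ro] == "ro" */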
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
deleted file mode 100644 (file)
index d6443e1..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
-#define _BCACHEFS_SB_MEMBERS_TYPES_H
-
-struct bch_member_cpu {
-       u64                     nbuckets;       /* device size */
-       u64                     nbuckets_minus_first;
-       u16                     first_bucket;   /* index of first bucket used */
-       u16                     bucket_size;    /* sectors */
-       u16                     group;
-       u8                      state;
-       u8                      discard;
-       u8                      data_allowed;
-       u8                      durability;
-       u8                      freespace_initialized;
-       u8                      resize_on_mount;
-       u8                      valid;
-       u8                      btree_bitmap_shift;
-       u64                     btree_allocated_bitmap;
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
deleted file mode 100644 (file)
index c4b3d8d..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SEQMUTEX_H
-#define _BCACHEFS_SEQMUTEX_H
-
-#include <linux/mutex.h>
-
-struct seqmutex {
-       struct mutex    lock;
-       u32             seq;
-};
-
-#define seqmutex_init(_lock)   mutex_init(&(_lock)->lock)
-
-static inline bool seqmutex_trylock(struct seqmutex *lock)
-{
-       return mutex_trylock(&lock->lock);
-}
-
-static inline void seqmutex_lock(struct seqmutex *lock)
-{
-       mutex_lock(&lock->lock);
-       lock->seq++;
-}
-
-static inline u32 seqmutex_unlock(struct seqmutex *lock)
-{
-       u32 seq = lock->seq;
-       mutex_unlock(&lock->lock);
-       return seq;
-}
-
-static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
-{
-       if (lock->seq != seq || !mutex_trylock(&lock->lock))
-               return false;
-
-       if (lock->seq != seq) {
-               mutex_unlock(&lock->lock);
-               return false;
-       }
-
-       return true;
-}
-
-#endif /* _BCACHEFS_SEQMUTEX_H */
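A seqmutex is a mutex plus a generation counter that is bumped on every lock: a caller can record the sequence at unlock time, drop the lock across a blocking operation, and later use seqmutex_relock() to re-take it only if no one else locked it in the interim. A minimal usage sketch (the surrounding function is hypothetical):

	void example(struct seqmutex *m)
	{
		seqmutex_lock(m);
		/* ... observe state protected by m ... */
		u32 seq = seqmutex_unlock(m);

		/* ... blocking work with the lock dropped ... */

		if (seqmutex_relock(m, seq)) {
			/* no intervening lockers: earlier observations still hold */
			seqmutex_unlock(m);
		}
	}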
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
deleted file mode 100644 (file)
index a1cc44e..0000000
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: BSD-3-Clause
-/*     $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
-
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
- * are the number of compression rounds and the number of finalization rounds.
- * A compression round is identical to a finalization round and this round
- * function is called SipRound.  Given a 128-bit key k and a (possibly empty)
- * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
- *
- * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
- * by Jean-Philippe Aumasson and Daniel J. Bernstein,
- * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
- * https://131002.net/siphash/siphash.pdf
- * https://131002.net/siphash/
- */
-
-#include <asm/byteorder.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-
-#include "siphash.h"
-
-static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
-       while (rounds--) {
-               ctx->v[0] += ctx->v[1];
-               ctx->v[2] += ctx->v[3];
-               ctx->v[1] = rol64(ctx->v[1], 13);
-               ctx->v[3] = rol64(ctx->v[3], 16);
-
-               ctx->v[1] ^= ctx->v[0];
-               ctx->v[3] ^= ctx->v[2];
-               ctx->v[0] = rol64(ctx->v[0], 32);
-
-               ctx->v[2] += ctx->v[1];
-               ctx->v[0] += ctx->v[3];
-               ctx->v[1] = rol64(ctx->v[1], 17);
-               ctx->v[3] = rol64(ctx->v[3], 21);
-
-               ctx->v[1] ^= ctx->v[2];
-               ctx->v[3] ^= ctx->v[0];
-               ctx->v[2] = rol64(ctx->v[2], 32);
-       }
-}
-
-static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
-{
-       u64 m = get_unaligned_le64(ptr);
-
-       ctx->v[3] ^= m;
-       SipHash_Rounds(ctx, rounds);
-       ctx->v[0] ^= m;
-}
-
-void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
-{
-       u64 k0, k1;
-
-       k0 = le64_to_cpu(key->k0);
-       k1 = le64_to_cpu(key->k1);
-
-       ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
-       ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
-       ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
-       ctx->v[3] = 0x7465646279746573ULL ^ k1;
-
-       memset(ctx->buf, 0, sizeof(ctx->buf));
-       ctx->bytes = 0;
-}
-
-void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
-                   const void *src, size_t len)
-{
-       const u8 *ptr = src;
-       size_t left, used;
-
-       if (len == 0)
-               return;
-
-       used = ctx->bytes % sizeof(ctx->buf);
-       ctx->bytes += len;
-
-       if (used > 0) {
-               left = sizeof(ctx->buf) - used;
-
-               if (len >= left) {
-                       memcpy(&ctx->buf[used], ptr, left);
-                       SipHash_CRounds(ctx, ctx->buf, rc);
-                       len -= left;
-                       ptr += left;
-               } else {
-                       memcpy(&ctx->buf[used], ptr, len);
-                       return;
-               }
-       }
-
-       while (len >= sizeof(ctx->buf)) {
-               SipHash_CRounds(ctx, ptr, rc);
-               len -= sizeof(ctx->buf);
-               ptr += sizeof(ctx->buf);
-       }
-
-       if (len > 0)
-               memcpy(&ctx->buf[used], ptr, len);
-}
-
-void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
-{
-       u64 r;
-
-       r = SipHash_End(ctx, rc, rf);
-
-       *((__le64 *) dst) = cpu_to_le64(r);
-}
-
-u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
-{
-       u64 r;
-       size_t left, used;
-
-       used = ctx->bytes % sizeof(ctx->buf);
-       left = sizeof(ctx->buf) - used;
-       memset(&ctx->buf[used], 0, left - 1);
-       ctx->buf[7] = ctx->bytes;
-
-       SipHash_CRounds(ctx, ctx->buf, rc);
-       ctx->v[2] ^= 0xff;
-       SipHash_Rounds(ctx, rf);
-
-       r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
-       memset(ctx, 0, sizeof(*ctx));
-       return r;
-}
-
-u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
-{
-       SIPHASH_CTX ctx;
-
-       SipHash_Init(&ctx, key);
-       SipHash_Update(&ctx, rc, rf, src, len);
-       return SipHash_End(&ctx, rc, rf);
-}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
deleted file mode 100644 (file)
index 3dfaf34..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
-/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
- * optimized for speed on short messages returning a 64bit hash/digest value.
- *
- * The number of rounds is defined during the initialization:
- *  SipHash24_Init() for the fast and reasonably strong version
- *  SipHash48_Init() for the strong version (half as fast)
- *
- * SIPHASH_CTX ctx;
- * SIPHASH_KEY key;	/* 16 bytes */
- * SipHash24_Init(&ctx, &key);
- * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
- * SipHash24_Final(output, &ctx);
- */
-
-#ifndef _SIPHASH_H_
-#define _SIPHASH_H_
-
-#include <linux/types.h>
-
-#define SIPHASH_BLOCK_LENGTH    8
-#define SIPHASH_KEY_LENGTH     16
-#define SIPHASH_DIGEST_LENGTH   8
-
-typedef struct _SIPHASH_CTX {
-       u64             v[4];
-       u8              buf[SIPHASH_BLOCK_LENGTH];
-       u32             bytes;
-} SIPHASH_CTX;
-
-typedef struct {
-       __le64          k0;
-       __le64          k1;
-} SIPHASH_KEY;
-
-void   SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
-void   SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
-u64    SipHash_End(SIPHASH_CTX *, int, int);
-void   SipHash_Final(void *, SIPHASH_CTX *, int, int);
-u64    SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
-
-#define SipHash24_Init(_c, _k)         SipHash_Init((_c), (_k))
-#define SipHash24_Update(_c, _p, _l)   SipHash_Update((_c), 2, 4, (_p), (_l))
-#define SipHash24_End(_d)              SipHash_End((_d), 2, 4)
-#define SipHash24_Final(_d, _c)                SipHash_Final((_d), (_c), 2, 4)
-#define SipHash24(_k, _p, _l)          SipHash((_k), 2, 4, (_p), (_l))
-
-#define SipHash48_Init(_c, _k)         SipHash_Init((_c), (_k))
-#define SipHash48_Update(_c, _p, _l)   SipHash_Update((_c), 4, 8, (_p), (_l))
-#define SipHash48_End(_d)              SipHash_End((_d), 4, 8)
-#define SipHash48_Final(_d, _c)                SipHash_Final((_d), (_c), 4, 8)
-#define SipHash48(_k, _p, _l)          SipHash((_k), 4, 8, (_p), (_l))
-
-#endif /* _SIPHASH_H_ */
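For one-shot hashing, the SipHash24() wrapper manages the context internally. A minimal sketch using the macros above (the key bytes are arbitrary test values, not from the file):

	SIPHASH_KEY key = {
		.k0 = cpu_to_le64(0x0706050403020100ULL),
		.k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
	};
	static const char msg[] = "some short message";
	u64 digest = SipHash24(&key, msg, sizeof(msg) - 1);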
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
deleted file mode 100644 (file)
index 538c324..0000000
+++ /dev/null
@@ -1,878 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#include "six.h"
-
-#ifdef DEBUG
-#define EBUG_ON(cond)                  BUG_ON(cond)
-#else
-#define EBUG_ON(cond)                  do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip)       lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip)             lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET      0
-#define SIX_LOCK_HELD_read             ~(~0U << 26)
-#define SIX_LOCK_HELD_intent           (1U << 26)
-#define SIX_LOCK_HELD_write            (1U << 27)
-#define SIX_LOCK_WAITING_read          (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_write         (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN                        (1U << 31)
-
-struct six_lock_vals {
-       /* Value we add to the lock in order to take the lock: */
-       u32                     lock_val;
-
-       /* If the lock has this value (used as a mask), taking the lock fails: */
-       u32                     lock_fail;
-
-       /* Mask that indicates lock is held for this type: */
-       u32                     held_mask;
-
-       /* Waitlist we wakeup when releasing the lock: */
-       enum six_lock_type      unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
-       [SIX_LOCK_read] = {
-               .lock_val       = 1U << SIX_LOCK_HELD_read_OFFSET,
-               .lock_fail      = SIX_LOCK_HELD_write,
-               .held_mask      = SIX_LOCK_HELD_read,
-               .unlock_wakeup  = SIX_LOCK_write,
-       },
-       [SIX_LOCK_intent] = {
-               .lock_val       = SIX_LOCK_HELD_intent,
-               .lock_fail      = SIX_LOCK_HELD_intent,
-               .held_mask      = SIX_LOCK_HELD_intent,
-               .unlock_wakeup  = SIX_LOCK_intent,
-       },
-       [SIX_LOCK_write] = {
-               .lock_val       = SIX_LOCK_HELD_write,
-               .lock_fail      = SIX_LOCK_HELD_read,
-               .held_mask      = SIX_LOCK_HELD_write,
-               .unlock_wakeup  = SIX_LOCK_read,
-       },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
-       if ((atomic_read(&lock->state) & mask) != mask)
-               atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
-       if (atomic_read(&lock->state) & mask)
-               atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
-                                u32 old, struct task_struct *owner)
-{
-       if (type != SIX_LOCK_intent)
-               return;
-
-       if (!(old & SIX_LOCK_HELD_intent)) {
-               EBUG_ON(lock->owner);
-               lock->owner = owner;
-       } else {
-               EBUG_ON(lock->owner != current);
-       }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
-       unsigned read_count = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               read_count += *per_cpu_ptr(lock->readers, cpu);
-       return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for another thread taking the competing lock type, and we may have to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
-                           struct task_struct *task, bool try)
-{
-       int ret;
-       u32 old;
-
-       EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
-       EBUG_ON(type == SIX_LOCK_write &&
-               (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
-       /*
-        * Percpu reader mode:
-        *
-        * The basic idea behind this algorithm is that you can implement a lock
-        * between two threads without any atomics, just memory barriers:
-        *
-        * For two threads you'll need two variables, one variable for "thread a
-        * has the lock" and another for "thread b has the lock".
-        *
-        * To take the lock, a thread sets its variable indicating that it holds
-        * the lock, then issues a full memory barrier, then reads from the
-        * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we back off and retry/sleep.
-        *
-        * Failure to take the lock may cause a spurious trylock failure in
-        * another thread, because we temporarily set the lock to indicate that
- * we held it. This would be a problem for a thread in six_lock() that
- * calls trylock after adding itself to the waitlist and prior to
- * sleeping.
-        *
-        * Therefore, if we fail to get the lock, and there were waiters of the
-        * type we conflict with, we will have to issue a wakeup.
-        *
-        * Since we may be called under wait_lock (and by the wakeup code
-        * itself), we return that the wakeup has to be done instead of doing it
-        * here.
-        */
-       if (type == SIX_LOCK_read && lock->readers) {
-               preempt_disable();
-               this_cpu_inc(*lock->readers); /* signal that we own lock */
-
-               smp_mb();
-
-               old = atomic_read(&lock->state);
-               ret = !(old & l[type].lock_fail);
-
-               this_cpu_sub(*lock->readers, !ret);
-               preempt_enable();
-
-               if (!ret) {
-                       smp_mb();
-                       if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
-                               ret = -1 - SIX_LOCK_write;
-               }
-       } else if (type == SIX_LOCK_write && lock->readers) {
-               if (try)
-                       atomic_add(SIX_LOCK_HELD_write, &lock->state);
-
-               /*
-                * Make sure the atomic_add happens before pcpu_read_count, and
-                * that six_set_bitmask in the slow path happens before
-                * pcpu_read_count.
-                *
-                * Paired with the smp_mb() in the read lock fast path (per-cpu
-                * mode) and the one before atomic_read in the read unlock path.
-                */
-               smp_mb();
-               ret = !pcpu_read_count(lock);
-
-               if (try && !ret) {
-                       old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
-                       if (old & SIX_LOCK_WAITING_read)
-                               ret = -1 - SIX_LOCK_read;
-               }
-       } else {
-               old = atomic_read(&lock->state);
-               do {
-                       ret = !(old & l[type].lock_fail);
-                       if (!ret || (type == SIX_LOCK_write && !try)) {
-                               smp_mb();
-                               break;
-                       }
-               } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
-               EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
-       }
-
-       if (ret > 0)
-               six_set_owner(lock, type, old, task);
-
-       EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
-               (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
-       return ret;
-}
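Distilled out of the six-lock specifics, the percpu reader protocol described in the comment above is the classic store/full-barrier/load handshake between two threads. A standalone sketch of the two-thread case (not code from this file; thread B's trylock is symmetric):

	static bool a_holds, b_holds;

	static bool thread_a_trylock(void)
	{
		WRITE_ONCE(a_holds, true);	/* claim the lock */
		smp_mb();			/* order our store before their load */
		if (READ_ONCE(b_holds)) {
			WRITE_ONCE(a_holds, false);	/* raced with b: back off */
			return false;
		}
		return true;
	}

In __do_six_trylock() the percpu counter increment plays the role of the store, and the atomic_read() of lock->state plays the role of the load.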
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
-       struct six_lock_waiter *w, *next;
-       struct task_struct *task;
-       bool saw_one;
-       int ret;
-again:
-       ret = 0;
-       saw_one = false;
-       raw_spin_lock(&lock->wait_lock);
-
-       list_for_each_entry_safe(w, next, &lock->wait_list, list) {
-               if (w->lock_want != lock_type)
-                       continue;
-
-               if (saw_one && lock_type != SIX_LOCK_read)
-                       goto unlock;
-               saw_one = true;
-
-               ret = __do_six_trylock(lock, lock_type, w->task, false);
-               if (ret <= 0)
-                       goto unlock;
-
-               /*
-                * Similar to percpu_rwsem_wake_function(), we need to guard
-                * against the wakee noticing w->lock_acquired, returning, and
-                * then exiting before we do the wakeup:
-                */
-               task = get_task_struct(w->task);
-               __list_del(w->list.prev, w->list.next);
-               /*
-                * The release barrier here ensures the ordering of the
-                * __list_del before setting w->lock_acquired; @w is on the
-                * stack of the thread doing the waiting and will be reused
-                * after it sees w->lock_acquired with no other locking:
-                * pairs with smp_load_acquire() in six_lock_slowpath()
-                */
-               smp_store_release(&w->lock_acquired, true);
-               wake_up_process(task);
-               put_task_struct(task);
-       }
-
-       six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
-       raw_spin_unlock(&lock->wait_lock);
-
-       if (ret < 0) {
-               lock_type = -ret - 1;
-               goto again;
-       }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
-                           enum six_lock_type lock_type)
-{
-       if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
-               return;
-
-       if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
-               return;
-
-       __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
-       int ret;
-
-       ret = __do_six_trylock(lock, type, current, try);
-       if (ret < 0)
-               __six_lock_wakeup(lock, -ret - 1);
-
-       return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
-       if (!do_six_trylock(lock, type, true))
-               return false;
-
-       if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq:       lock sequence number obtained from six_lock_seq() while lock was
- *             held previously
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-                  unsigned seq, unsigned long ip)
-{
-       if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
-               return false;
-
-       if (six_lock_seq(lock) != seq) {
-               six_unlock_ip(lock, type, ip);
-               return false;
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
-
-#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-
-static inline bool six_owner_running(struct six_lock *lock)
-{
-       /*
-        * When there's no owner, we might have preempted between the owner
-        * acquiring the lock and setting the owner field. If we're an RT task,
-        * spinning would live-lock because we'd never let the owner complete.
-        */
-       guard(rcu)();
-       struct task_struct *owner = READ_ONCE(lock->owner);
-       return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
-                                      struct six_lock_waiter *wait,
-                                      enum six_lock_type type)
-{
-       unsigned loop = 0;
-       u64 end_time;
-
-       if (type == SIX_LOCK_write)
-               return false;
-
-       if (lock->wait_list.next != &wait->list)
-               return false;
-
-       if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
-               return false;
-
-       preempt_disable();
-       end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
-       while (!need_resched() && six_owner_running(lock)) {
-               /*
-                * Ensures that writes to the waitlist entry happen after we see
-                * wait->lock_acquired: pairs with the smp_store_release in
-                * __six_lock_wakeup
-                */
-               if (smp_load_acquire(&wait->lock_acquired)) {
-                       preempt_enable();
-                       return true;
-               }
-
-               if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
-                       six_set_bitmask(lock, SIX_LOCK_NOSPIN);
-                       break;
-               }
-
-               /*
-                * The cpu_relax() call is a compiler barrier which forces
-                * everything in this loop to be re-loaded. We don't need
-                * memory barriers as we'll eventually observe the right
-                * values at the cost of a few extra spins.
-                */
-               cpu_relax();
-       }
-
-       preempt_enable();
-       return false;
-}
-
-#else /* !CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN */
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
-                                      struct six_lock_waiter *wait,
-                                      enum six_lock_type type)
-{
-       return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
-                            struct six_lock_waiter *wait,
-                            six_lock_should_sleep_fn should_sleep_fn, void *p,
-                            unsigned long ip)
-{
-       int ret = 0;
-
-       if (type == SIX_LOCK_write) {
-               EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-               atomic_add(SIX_LOCK_HELD_write, &lock->state);
-               smp_mb__after_atomic();
-       }
-
-       trace_contention_begin(lock, 0);
-       lock_contended(&lock->dep_map, ip);
-
-       wait->task              = current;
-       wait->lock_want         = type;
-       wait->lock_acquired     = false;
-
-       raw_spin_lock(&lock->wait_lock);
-       six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
-       /*
-        * Retry taking the lock after taking waitlist lock, in case we raced
-        * with an unlock:
-        */
-       ret = __do_six_trylock(lock, type, current, false);
-       if (ret <= 0) {
-               wait->start_time = local_clock();
-
-               if (!list_empty(&lock->wait_list)) {
-                       struct six_lock_waiter *last =
-                               list_last_entry(&lock->wait_list,
-                                       struct six_lock_waiter, list);
-
-                       if (time_before_eq64(wait->start_time, last->start_time))
-                               wait->start_time = last->start_time + 1;
-               }
-
-               list_add_tail(&wait->list, &lock->wait_list);
-       }
-       raw_spin_unlock(&lock->wait_lock);
-
-       if (unlikely(ret > 0)) {
-               ret = 0;
-               goto out;
-       }
-
-       if (unlikely(ret < 0)) {
-               __six_lock_wakeup(lock, -ret - 1);
-               ret = 0;
-       }
-
-       if (six_optimistic_spin(lock, wait, type))
-               goto out;
-
-       while (1) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-
-               /*
-                * Ensures that writes to the waitlist entry happen after we see
-                * wait->lock_acquired: pairs with the smp_store_release in
-                * __six_lock_wakeup
-                */
-               if (smp_load_acquire(&wait->lock_acquired))
-                       break;
-
-               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-               if (unlikely(ret)) {
-                       bool acquired;
-
-                       /*
-                        * If should_sleep_fn() returns an error, we are
-                        * required to return that error even if we already
-                        * acquired the lock - should_sleep_fn() might have
-                        * modified external state (e.g. when the deadlock cycle
-                        * detector in bcachefs issued a transaction restart)
-                        */
-                       raw_spin_lock(&lock->wait_lock);
-                       acquired = wait->lock_acquired;
-                       if (!acquired)
-                               list_del(&wait->list);
-                       raw_spin_unlock(&lock->wait_lock);
-
-                       if (unlikely(acquired)) {
-                               do_six_unlock_type(lock, type);
-                       } else if (type == SIX_LOCK_write) {
-                               six_clear_bitmask(lock, SIX_LOCK_HELD_write);
-                               six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
-                       }
-                       break;
-               }
-
-               schedule();
-       }
-
-       __set_current_state(TASK_RUNNING);
-out:
-       trace_contention_end(lock, 0);
-
-       return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait:      pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-                      struct six_lock_waiter *wait,
-                      six_lock_should_sleep_fn should_sleep_fn, void *p,
-                      unsigned long ip)
-{
-       int ret;
-
-       wait->start_time = 0;
-
-       if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
-       ret = do_six_trylock(lock, type, true) ? 0
-               : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
-       if (ret && type != SIX_LOCK_write)
-               six_release(&lock->dep_map, ip);
-       if (!ret)
-               lock_acquired(&lock->dep_map, ip);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
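A hedged sketch of a caller (the names are hypothetical; in bcachefs proper the waiter is embedded in the btree transaction object and should_sleep_fn runs the deadlock cycle detector):

	static int example_should_sleep(struct six_lock *lock, void *p)
	{
		/* returning nonzero here aborts the lock attempt with that error */
		return 0;
	}

	static int example_lock_node(struct six_lock *lock)
	{
		struct six_lock_waiter wait;

		return six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
					  example_should_sleep, NULL, _THIS_IP_);
	}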
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       u32 state;
-
-       if (type == SIX_LOCK_intent)
-               lock->owner = NULL;
-
-       if (type == SIX_LOCK_read &&
-           lock->readers) {
-               smp_mb(); /* unlock barrier */
-               this_cpu_dec(*lock->readers);
-               smp_mb(); /* between unlocking and checking for waiters */
-               state = atomic_read(&lock->state);
-       } else {
-               u32 v = l[type].lock_val;
-
-               if (type != SIX_LOCK_read)
-                       v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
-               EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
-               state = atomic_sub_return_release(v, &lock->state);
-       }
-
-       six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock:      lock to unlock
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock);                          read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
-       EBUG_ON(type == SIX_LOCK_write &&
-               !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-       EBUG_ON((type == SIX_LOCK_write ||
-                type == SIX_LOCK_intent) &&
-               lock->owner != current);
-
-       if (type != SIX_LOCK_write)
-               six_release(&lock->dep_map, ip);
-
-       if (type == SIX_LOCK_intent &&
-           lock->intent_lock_recurse) {
-               --lock->intent_lock_recurse;
-               return;
-       }
-
-       if (type == SIX_LOCK_write &&
-           lock->write_lock_recurse) {
-               --lock->write_lock_recurse;
-               return;
-       }
-
-       if (type == SIX_LOCK_write)
-               lock->seq++;
-
-       do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock:      lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
-       six_lock_increment(lock, SIX_LOCK_read);
-       six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock:      lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
-       u32 old = atomic_read(&lock->state), new;
-
-       do {
-               new = old;
-
-               if (new & SIX_LOCK_HELD_intent)
-                       return false;
-
-               if (!lock->readers) {
-                       EBUG_ON(!(new & SIX_LOCK_HELD_read));
-                       new -= l[SIX_LOCK_read].lock_val;
-               }
-
-               new |= SIX_LOCK_HELD_intent;
-       } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
-       if (lock->readers)
-               this_cpu_dec(*lock->readers);
-
-       six_set_owner(lock, SIX_LOCK_intent, old, current);
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock:      lock to upgrade
- * @from:      SIX_LOCK_read or SIX_LOCK_intent
- * @to:                SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
-                        enum six_lock_type from,
-                        enum six_lock_type to)
-{
-       EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
-       if (to == from)
-               return true;
-
-       if (to == SIX_LOCK_read) {
-               six_lock_downgrade(lock);
-               return true;
-       } else {
-               return six_lock_tryupgrade(lock);
-       }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock:      lock to increment
- * @type:      SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
-       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
-       /* XXX: assert already locked, and that we don't overflow: */
-
-       switch (type) {
-       case SIX_LOCK_read:
-               if (lock->readers) {
-                       this_cpu_inc(*lock->readers);
-               } else {
-                       EBUG_ON(!(atomic_read(&lock->state) &
-                                 (SIX_LOCK_HELD_read|
-                                  SIX_LOCK_HELD_intent)));
-                       atomic_add(l[type].lock_val, &lock->state);
-               }
-               break;
-       case SIX_LOCK_write:
-               lock->write_lock_recurse++;
-               fallthrough;
-       case SIX_LOCK_intent:
-               EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-               lock->intent_lock_recurse++;
-               break;
-       }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock:      lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
-       u32 state = atomic_read(&lock->state);
-       struct six_lock_waiter *w;
-
-       six_lock_wakeup(lock, state, SIX_LOCK_read);
-       six_lock_wakeup(lock, state, SIX_LOCK_intent);
-       six_lock_wakeup(lock, state, SIX_LOCK_write);
-
-       raw_spin_lock(&lock->wait_lock);
-       list_for_each_entry(w, &lock->wait_list, list)
-               wake_up_process(w->task);
-       raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock:      lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
-       struct six_lock_count ret;
-
-       ret.n[SIX_LOCK_read]    = !lock->readers
-               ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
-               : pcpu_read_count(lock);
-       ret.n[SIX_LOCK_intent]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
-               lock->intent_lock_recurse;
-       ret.n[SIX_LOCK_write]   = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock:      lock to add/subtract readers for
- * @nr:                reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
-       if (lock->readers) {
-               this_cpu_add(*lock->readers, nr);
-       } else {
-               EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
-               /* reader count starts at bit 0 */
-               atomic_add(nr, &lock->state);
-       }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
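Concretely, the subtract/lock/re-add dance described above looks roughly like this (a sketch; nr_read_held stands for whatever per-thread count the upper layer maintains):

	/* held: one intent lock, plus nr_read_held read locks in this thread */
	six_lock_readers_add(lock, -nr_read_held);	/* hide our own readers */
	six_lock_write(lock);				/* can no longer self-deadlock */
	six_lock_readers_add(lock, nr_read_held);	/* restore the counts */
	/* ... critical section ... */
	six_unlock_write(lock);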
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock:      lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
-       WARN_ON(lock->readers && pcpu_read_count(lock));
-       WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
-       free_percpu(lock->readers);
-       lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-                    struct lock_class_key *key, enum six_lock_init_flags flags,
-                    gfp_t gfp)
-{
-       atomic_set(&lock->state, 0);
-       raw_spin_lock_init(&lock->wait_lock);
-       INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
-       lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
-       /*
-        * Don't assume that we have real percpu variables available in
-        * userspace:
-        */
-#ifdef __KERNEL__
-       if (flags & SIX_LOCK_INIT_PCPU) {
-               /*
-                * We don't return an error here on memory allocation failure
-                * since percpu is an optimization, and locks will work with the
-                * same semantics in non-percpu mode: callers can check for
-                * failure if they wish by checking lock->readers, but generally
-                * will not want to treat it as an error.
-                */
-               lock->readers = alloc_percpu_gfp(unsigned, gfp);
-       }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
deleted file mode 100644 (file)
index 59b851c..0000000
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * nodes, without deadlocking.
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- *   six_lock_intent(&foo->lock);
- *   six_lock_write(&foo->lock);
- *   six_unlock_write(&foo->lock);
- *   six_unlock_intent(&foo->lock);
- *
- * Other operations:
- *   six_trylock_read()
- *   six_trylock_intent()
- *   six_trylock_write()
- *
- *   six_lock_downgrade()      convert from intent to read
- *   six_lock_tryupgrade()     attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- *   six_lock_type(&foo->lock, SIX_LOCK_read);
- *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- *   six_lock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- *   Locks embed sequence numbers, which are incremented on write lock/unlock.
- *   This allows locks to be dropped and then retaken iff the state they protect
- *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
- *   doing IO or allocating memory.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     u32 seq = six_lock_seq(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *
- *     some_operation_that_may_block();
- *
- *     if (six_relock_read(&foo->lock, seq)) { ... }
- *
- *   If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- *   Six locks are not by themselves reentrant, but have counters for both the
- *   read and intent states that can be used to provide reentrancy by an upper
- *   layer that tracks held locks. If a lock is known to already be held in the
- *   read or intent state, six_lock_increment() can be used to bump the "lock
- *   held in this state" counter, increasing the number of unlock calls that
- *   will be required to fully unlock it.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     six_lock_increment(&foo->lock, SIX_LOCK_read);
- *     six_unlock_read(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *   foo->lock is now fully unlocked.
- *
- *   Since the intent state supersedes read, it's legal to increment the read
- *   counter when holding an intent lock, but not the reverse.
- *
- *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- *   is not legal.
- *
- * should_sleep_fn:
- *
- *   There is a six_lock() variant that takes a function pointer that is called
- *   immediately prior to schedule() when blocking, and may return an error to
- *   abort.
- *
- *   One possible use for this feature is when objects being locked are part of
- *   a cache and may be reused, and lock ordering is based on a property of the
- *   object that will change when the object is reused - i.e. logical key order.
- *
- *   If looking up an object in the cache may race with object reuse, and lock
- *   ordering is required to prevent deadlock, object reuse may change the
- *   correct lock order for that object and cause a deadlock. should_sleep_fn
- *   can be used to check if the object is still the object we want and avoid
- *   this deadlock.
- *
- * Wait list entry interface:
- *
- *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- *   wait list entry. By embedding six_lock_waiter into another object, and by
- *   traversing lock waitlists, it is then possible for an upper layer to
- *   implement full cycle detection for deadlock avoidance.
- *
- *   should_sleep_fn should be used for invoking the cycle detector, walking the
- *   graph of held locks to check for a deadlock. The upper layer must track
- *   held locks for each thread, and each thread's held locks must be reachable
- *   from its six_lock_waiter object.
- *
- *   six_lock_waiter() adds the wait object to the waitlist before retrying the
- *   lock and before calling should_sleep_fn; the wait object is not removed
- *   from the waitlist until either the lock has been successfully acquired, or
- *   the attempt was aborted because should_sleep_fn returned an error.
- *
- *   Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- *   have timestamps in strictly ascending order - this is so the timestamp can
- *   be used as a cursor for lock graph traversal.
- */
-
-#include <linux/lockdep.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
-       SIX_LOCK_read,
-       SIX_LOCK_intent,
-       SIX_LOCK_write,
-};
-
-struct six_lock {
-       atomic_t                state;
-       u32                     seq;
-       unsigned                intent_lock_recurse;
-       unsigned                write_lock_recurse;
-       struct task_struct      *owner;
-       unsigned __percpu       *readers;
-       raw_spinlock_t          wait_lock;
-       struct list_head        wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      dep_map;
-#endif
-};
-
-struct six_lock_waiter {
-       struct list_head        list;
-       struct task_struct      *task;
-       enum six_lock_type      lock_want;
-       bool                    lock_acquired;
-       u64                     start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
-       SIX_LOCK_INIT_PCPU      = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-                    struct lock_class_key *key, enum six_lock_init_flags flags,
-                    gfp_t gfp);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock:      lock to initialize
- * @flags:     optional flags, e.g. SIX_LOCK_INIT_PCPU
- * @gfp:       allocation flags, passed through to __six_lock_init()
- */
-#define six_lock_init(lock, flags, gfp)                                        \
-do {                                                                   \
-       static struct lock_class_key __key;                             \
-                                                                       \
-       __six_lock_init((lock), #lock, &__key, flags, gfp);                     \
-} while (0)
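-
-/*
- * Example usage (sketch): initialize a lock with percpu reader counts,
- * allocated with GFP_KERNEL:
- *
- *     six_lock_init(&foo->lock, SIX_LOCK_INIT_PCPU, GFP_KERNEL);
- */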
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock:      six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
-       return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-                      struct six_lock_waiter *wait,
-                      six_lock_should_sleep_fn should_sleep_fn, void *p,
-                      unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait:      pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
-                                 struct six_lock_waiter *wait,
-                                 six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
-                             six_lock_should_sleep_fn should_sleep_fn, void *p,
-                             unsigned long ip)
-{
-       struct six_lock_waiter wait;
-
-       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                               six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       struct six_lock_waiter wait;
-
-       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-                  unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq:       lock sequence number obtained from six_lock_seq() while lock was
- *             held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
-                                  unsigned seq)
-{
-       return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock:      lock to unlock
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock);                          read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type)                                               \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{                                                                      \
-       return six_trylock_ip(lock, SIX_LOCK_##type, ip);               \
-}                                                                      \
-                                                                       \
-static inline bool six_trylock_##type(struct six_lock *lock)           \
-{                                                                      \
-       return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);        \
-}                                                                      \
-                                                                       \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock,     \
-                          struct six_lock_waiter *wait,                \
-                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
-                          unsigned long ip)                            \
-{                                                                      \
-       return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-}                                                                      \
-                                                                       \
-static inline int six_lock_ip_##type(struct six_lock *lock,            \
-                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
-                   unsigned long ip)                                   \
-{                                                                      \
-       return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-}                                                                      \
-                                                                       \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{                                                                      \
-       return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);           \
-}                                                                      \
-                                                                       \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq)   \
-{                                                                      \
-       return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);    \
-}                                                                      \
-                                                                       \
-static inline int six_lock_##type(struct six_lock *lock,               \
-                                 six_lock_should_sleep_fn fn, void *p)\
-{                                                                      \
-       return six_lock_ip_##type(lock, fn, p, _THIS_IP_);              \
-}                                                                      \
-                                                                       \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)       \
-{                                                                      \
-       six_unlock_ip(lock, SIX_LOCK_##type, ip);                       \
-}                                                                      \
-                                                                       \
-static inline void six_unlock_##type(struct six_lock *lock)            \
-{                                                                      \
-       six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);                \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
-                        enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
-       unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
deleted file mode 100644 (file)
index 4c43d2a..0000000
+++ /dev/null
@@ -1,2043 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-
-#include <linux/random.h>
-
-/*
- * Snapshot trees:
- *
- * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
- * exist to provide a stable identifier for the whole lifetime of a snapshot
- * tree.
- */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
-                               struct bkey_s_c k)
-{
-       struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
-       prt_printf(out, "subvol %u root snapshot %u",
-                  le32_to_cpu(t.v->master_subvol),
-                  le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
-                               struct bkey_validate_context from)
-{
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-                        bkey_lt(k.k->p, POS(0, 1)),
-                        c, snapshot_tree_pos_bad,
-                        "bad pos");
-fsck_err:
-       return ret;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
-                             struct bch_snapshot_tree *s)
-{
-       int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
-                                         BTREE_ITER_with_updates, snapshot_tree, s);
-
-       if (bch2_err_matches(ret, ENOENT))
-               ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
-       return ret;
-}
-
-struct bkey_i_snapshot_tree *
-__bch2_snapshot_tree_create(struct btree_trans *trans)
-{
-       struct btree_iter iter;
-       int ret = bch2_bkey_get_empty_slot(trans, &iter,
-                       BTREE_ID_snapshot_trees, POS(0, U32_MAX));
-       struct bkey_i_snapshot_tree *s_t;
-
-       if (ret == -BCH_ERR_ENOSPC_btree_slot)
-               ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
-       if (ret)
-               return ERR_PTR(ret);
-
-       s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
-       ret = PTR_ERR_OR_ZERO(s_t);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int bch2_snapshot_tree_create(struct btree_trans *trans,
-                               u32 root_id, u32 subvol_id, u32 *tree_id)
-{
-       struct bkey_i_snapshot_tree *n_tree =
-               __bch2_snapshot_tree_create(trans);
-
-       if (IS_ERR(n_tree))
-               return PTR_ERR(n_tree);
-
-       n_tree->v.master_subvol = cpu_to_le32(subvol_id);
-       n_tree->v.root_snapshot = cpu_to_le32(root_id);
-       *tree_id = n_tree->k.p.offset;
-       return 0;
-}
-
-/* Snapshot nodes: */
-
-static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-       while (id && id < ancestor) {
-               const struct snapshot_t *s = __snapshot_t(t, id);
-               id = s ? s->parent : 0;
-       }
-       return id == ancestor;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
-       guard(rcu)();
-       return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
-}
-
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-       const struct snapshot_t *s = __snapshot_t(t, id);
-       if (!s)
-               return 0;
-
-       if (s->skip[2] <= ancestor)
-               return s->skip[2];
-       if (s->skip[1] <= ancestor)
-               return s->skip[1];
-       if (s->skip[0] <= ancestor)
-               return s->skip[0];
-       return s->parent;
-}
-
-static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-       const struct snapshot_t *s = __snapshot_t(t, id);
-       if (!s)
-               return false;
-
-       return test_bit(ancestor - id - 1, s->is_ancestor);
-}
-
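-/*
- * Test whether @ancestor is an ancestor of @id: walk @id's skip pointers to
- * get within IS_ANCESTOR_BITMAP of @ancestor, then finish with an O(1) lookup
- * in that node's is_ancestor bitmap:
- */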
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       u32 orig_id = id;
-#endif
-
-       guard(rcu)();
-       struct snapshot_table *t = rcu_dereference(c->snapshots);
-
-       if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
-               return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
-
-       if (likely(ancestor >= IS_ANCESTOR_BITMAP))
-               while (id && id < ancestor - IS_ANCESTOR_BITMAP)
-                       id = get_ancestor_below(t, id, ancestor);
-
-       bool ret = id && id < ancestor
-               ? test_ancestor_bitmap(t, id, ancestor)
-               : id == ancestor;
-
-       EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
-       return ret;
-}
-
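-/*
- * Grow the RCU-protected snapshot table: allocate a larger table, copy the old
- * contents, publish it with rcu_assign_pointer() and free the old table after
- * a grace period, so lockless readers never see a dangling pointer:
- */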
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
-       size_t idx = U32_MAX - id;
-       struct snapshot_table *new, *old;
-
-       size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
-       size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
-
-       if (unlikely(new_bytes > INT_MAX))
-               return NULL;
-
-       new = kvzalloc(new_bytes, GFP_KERNEL);
-       if (!new)
-               return NULL;
-
-       new->nr = new_size;
-
-       old = rcu_dereference_protected(c->snapshots, true);
-       if (old)
-               memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
-
-       rcu_assign_pointer(c->snapshots, new);
-       kvfree_rcu(old, rcu);
-
-       return &rcu_dereference_protected(c->snapshots,
-                               lockdep_is_held(&c->snapshot_table_lock))->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
-       size_t idx = U32_MAX - id;
-       struct snapshot_table *table =
-               rcu_dereference_protected(c->snapshots,
-                               lockdep_is_held(&c->snapshot_table_lock));
-
-       lockdep_assert_held(&c->snapshot_table_lock);
-
-       if (likely(table && idx < table->nr))
-               return &table->s[idx];
-
-       return __snapshot_t_mut(c, id);
-}
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
-                          struct bkey_s_c k)
-{
-       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
-       if (BCH_SNAPSHOT_SUBVOL(s.v))
-               prt_str(out, "subvol ");
-       if (BCH_SNAPSHOT_WILL_DELETE(s.v))
-               prt_str(out, "will_delete ");
-       if (BCH_SNAPSHOT_DELETED(s.v))
-               prt_str(out, "deleted ");
-
-       prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
-              le32_to_cpu(s.v->parent),
-              le32_to_cpu(s.v->children[0]),
-              le32_to_cpu(s.v->children[1]),
-              le32_to_cpu(s.v->subvol),
-              le32_to_cpu(s.v->tree));
-
-       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
-               prt_printf(out, " depth %u skiplist %u %u %u",
-                          le32_to_cpu(s.v->depth),
-                          le32_to_cpu(s.v->skip[0]),
-                          le32_to_cpu(s.v->skip[1]),
-                          le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
-                          struct bkey_validate_context from)
-{
-       struct bkey_s_c_snapshot s;
-       u32 i, id;
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-                        bkey_lt(k.k->p, POS(0, 1)),
-                        c, snapshot_pos_bad,
-                        "bad pos");
-
-       s = bkey_s_c_to_snapshot(k);
-
-       id = le32_to_cpu(s.v->parent);
-       bkey_fsck_err_on(id && id <= k.k->p.offset,
-                        c, snapshot_parent_bad,
-                        "bad parent node (%u <= %llu)",
-                        id, k.k->p.offset);
-
-       bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
-                        c, snapshot_children_not_normalized,
-                        "children not normalized");
-
-       bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
-                        c, snapshot_child_duplicate,
-                        "duplicate child nodes");
-
-       for (i = 0; i < 2; i++) {
-               id = le32_to_cpu(s.v->children[i]);
-
-               bkey_fsck_err_on(id >= k.k->p.offset,
-                                c, snapshot_child_bad,
-                                "bad child node (%u >= %llu)",
-                                id, k.k->p.offset);
-       }
-
-       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
-               bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
-                                le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
-                                c, snapshot_skiplist_not_normalized,
-                                "skiplist not normalized");
-
-               for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
-                       id = le32_to_cpu(s.v->skip[i]);
-
-                       bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
-                                        c, snapshot_skiplist_bad,
-                                        "bad skiplist node %u", id);
-               }
-       }
-fsck_err:
-       return ret;
-}
-
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
-{
-       mutex_lock(&c->snapshot_table_lock);
-       int ret = snapshot_t_mut(c, id)
-               ? 0
-               : bch_err_throw(c, ENOMEM_mark_snapshot);
-       mutex_unlock(&c->snapshot_table_lock);
-       return ret;
-}
-
-static int __bch2_mark_snapshot(struct btree_trans *trans,
-                      enum btree_id btree, unsigned level,
-                      struct bkey_s_c old, struct bkey_s_c new,
-                      enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_t *t;
-       u32 id = new.k->p.offset;
-       int ret = 0;
-
-       mutex_lock(&c->snapshot_table_lock);
-
-       t = snapshot_t_mut(c, id);
-       if (!t) {
-               ret = bch_err_throw(c, ENOMEM_mark_snapshot);
-               goto err;
-       }
-
-       if (new.k->type == KEY_TYPE_snapshot) {
-               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
-
-               t->state        = !BCH_SNAPSHOT_DELETED(s.v)
-                       ? SNAPSHOT_ID_live
-                       : SNAPSHOT_ID_deleted;
-               t->parent       = le32_to_cpu(s.v->parent);
-               t->children[0]  = le32_to_cpu(s.v->children[0]);
-               t->children[1]  = le32_to_cpu(s.v->children[1]);
-               t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
-               t->tree         = le32_to_cpu(s.v->tree);
-
-               if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
-                       t->depth        = le32_to_cpu(s.v->depth);
-                       t->skip[0]      = le32_to_cpu(s.v->skip[0]);
-                       t->skip[1]      = le32_to_cpu(s.v->skip[1]);
-                       t->skip[2]      = le32_to_cpu(s.v->skip[2]);
-               } else {
-                       t->depth        = 0;
-                       t->skip[0]      = 0;
-                       t->skip[1]      = 0;
-                       t->skip[2]      = 0;
-               }
-
-               u32 parent = id;
-
-               while ((parent = bch2_snapshot_parent_early(c, parent)) &&
-                      parent - id - 1 < IS_ANCESTOR_BITMAP)
-                       __set_bit(parent - id - 1, t->is_ancestor);
-
-               if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
-                       set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
-                       if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
-                               bch2_delete_dead_snapshots_async(c);
-               }
-       } else {
-               memset(t, 0, sizeof(*t));
-       }
-err:
-       mutex_unlock(&c->snapshot_table_lock);
-       return ret;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
-                      enum btree_id btree, unsigned level,
-                      struct bkey_s_c old, struct bkey_s new,
-                      enum btree_iter_update_trigger_flags flags)
-{
-       return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
-                        struct bch_snapshot *s)
-{
-       return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
-                                      BTREE_ITER_with_updates, snapshot, s);
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
-       return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
-       return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
-       return bch2_snapshot_child(c, id, 1);
-}
-
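-/*
- * Pre-order traversal of a snapshot tree: descend to the left child if there
- * is one, otherwise climb until a parent has an unvisited right child:
- */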
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
-       u32 n, parent;
-
-       n = bch2_snapshot_left_child(c, id);
-       if (n)
-               return n;
-
-       while ((parent = bch2_snapshot_parent(c, id))) {
-               n = bch2_snapshot_right_child(c, parent);
-               if (n && n != id)
-                       return n;
-               id = parent;
-       }
-
-       return 0;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
-                               snapshot_id_list *skip)
-{
-       guard(rcu)();
-       u32 id, subvol = 0, s;
-retry:
-       id = snapshot_root;
-       while (id && bch2_snapshot_exists(c, id)) {
-               if (!(skip && snapshot_list_has_id(skip, id))) {
-                       s = snapshot_t(c, id)->subvol;
-
-                       if (s && (!subvol || s < subvol))
-                               subvol = s;
-               }
-               id = bch2_snapshot_tree_next(c, id);
-               if (id == snapshot_root)
-                       break;
-       }
-
-       if (!subvol && skip) {
-               skip = NULL;
-               goto retry;
-       }
-
-       return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
-                                           u32 snapshot_root, u32 *subvol_id)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       bool found = false;
-       int ret;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
-                                    0, k, ret) {
-               if (k.k->type != KEY_TYPE_subvolume)
-                       continue;
-
-               struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-               if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
-                       continue;
-               if (!BCH_SUBVOLUME_SNAP(s.v)) {
-                       *subvol_id = s.k->p.offset;
-                       found = true;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (!ret && !found) {
-               struct bkey_i_subvolume *u;
-
-               *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
-
-               u = bch2_bkey_get_mut_typed(trans, &iter,
-                                           BTREE_ID_subvolumes, POS(0, *subvol_id),
-                                           0, subvolume);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       return ret;
-
-               SET_BCH_SUBVOLUME_SNAP(&u->v, false);
-       }
-
-       return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c_snapshot_tree st;
-       struct bch_snapshot s;
-       struct bch_subvolume subvol;
-       struct printbuf buf = PRINTBUF;
-       struct btree_iter snapshot_iter = {};
-       u32 root_id;
-       int ret;
-
-       if (k.k->type != KEY_TYPE_snapshot_tree)
-               return 0;
-
-       st = bkey_s_c_to_snapshot_tree(k);
-       root_id = le32_to_cpu(st.v->root_snapshot);
-
-       struct bkey_s_c_snapshot snapshot_k =
-               bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
-                                        POS(0, root_id), 0, snapshot);
-       ret = bkey_err(snapshot_k);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (!ret)
-               bkey_val_copy(&s, snapshot_k);
-
-       if (fsck_err_on(ret ||
-                       root_id != bch2_snapshot_root(c, root_id) ||
-                       st.k->p.offset != le32_to_cpu(s.tree),
-                       trans, snapshot_tree_to_missing_snapshot,
-                       "snapshot tree points to missing/incorrect snapshot:\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, st.s_c),
-                        prt_newline(&buf),
-                        ret
-                        ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
-                        : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
-                        buf.buf))) {
-               ret = bch2_btree_delete_at(trans, iter, 0);
-               goto err;
-       }
-
-       if (!st.v->master_subvol)
-               goto out;
-
-       ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (fsck_err_on(ret,
-                       trans, snapshot_tree_to_missing_subvol,
-                       "snapshot tree points to missing subvolume:\n%s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-           fsck_err_on(!bch2_snapshot_is_ancestor(c,
-                                               le32_to_cpu(subvol.snapshot),
-                                               root_id),
-                       trans, snapshot_tree_to_wrong_subvol,
-                       "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-           fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
-                       trans, snapshot_tree_to_snapshot_subvol,
-                       "snapshot tree points to snapshot subvolume:\n%s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
-               struct bkey_i_snapshot_tree *u;
-               u32 subvol_id;
-
-               ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
-               bch_err_fn(c, ret);
-
-               if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
-                       ret = 0;
-                       goto err;
-               }
-
-               if (ret)
-                       goto err;
-
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.master_subvol = cpu_to_le32(subvol_id);
-               st = snapshot_tree_i_to_s_c(u);
-       }
-out:
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &snapshot_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                       BTREE_ID_snapshot_trees, POS_MIN,
-                       BTREE_ITER_prefetch, k,
-                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-               check_snapshot_tree(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/*
- * Look up the snapshot tree for @tree_id, find its root, and check that
- * @snap_id is a descendant:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
-                                 u32 snap_id, u32 tree_id)
-{
-       struct bch_snapshot_tree s_t;
-       int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
-       if (bch2_err_matches(ret, ENOENT))
-               return 0;
-       if (ret)
-               return ret;
-
-       return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
-       if (!id)
-               return 0;
-
-       guard(rcu)();
-       const struct snapshot_t *s = snapshot_t(c, id);
-       return s->parent
-               ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
-               : id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
-{
-       unsigned i;
-
-       for (i = 0; i < 3; i++)
-               if (!s.parent) {
-                       if (s.skip[i])
-                               return false;
-               } else {
-                       if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
-                               return false;
-               }
-
-       return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
-                                   struct btree_iter *iter,
-                                   struct bkey_s_c k,
-                                   struct bch_snapshot *s)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter root_iter;
-       struct bch_snapshot_tree s_t;
-       struct bkey_s_c_snapshot root;
-       struct bkey_i_snapshot *u;
-       u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
-       int ret;
-
-       root = bch2_bkey_get_iter_typed(trans, &root_iter,
-                              BTREE_ID_snapshots, POS(0, root_id),
-                              BTREE_ITER_with_updates, snapshot);
-       ret = bkey_err(root);
-       if (ret)
-               goto err;
-
-       tree_id = le32_to_cpu(root.v->tree);
-
-       ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
-               u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
-               ret =   PTR_ERR_OR_ZERO(u) ?:
-                       bch2_snapshot_tree_create(trans, root_id,
-                               bch2_snapshot_oldest_subvol(c, root_id, NULL),
-                               &tree_id);
-               if (ret)
-                       goto err;
-
-               u->v.tree = cpu_to_le32(tree_id);
-               if (k.k->p.offset == root_id)
-                       *s = u->v;
-       }
-
-       if (k.k->p.offset != root_id) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.tree = cpu_to_le32(tree_id);
-               *s = u->v;
-       }
-err:
-       bch2_trans_iter_exit(trans, &root_iter);
-       return ret;
-}
-
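-/*
- * Check an individual snapshot node: verify that its parent and children point
- * back to it, that its subvolume (if any) points back to it, and that its
- * tree, depth and skiplist fields are consistent, repairing what we can:
- */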
-static int check_snapshot(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_snapshot s;
-       struct bch_subvolume subvol;
-       struct bch_snapshot v;
-       struct bkey_i_snapshot *u;
-       u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
-       u32 real_depth;
-       struct printbuf buf = PRINTBUF;
-       u32 i, id;
-       int ret = 0;
-
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       memset(&s, 0, sizeof(s));
-       memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
-
-       if (BCH_SNAPSHOT_DELETED(&s))
-               return 0;
-
-       id = le32_to_cpu(s.parent);
-       if (id) {
-               ret = bch2_snapshot_lookup(trans, id, &v);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot with nonexistent parent:\n  %s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-               if (ret)
-                       goto err;
-
-               if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
-                   le32_to_cpu(v.children[1]) != k.k->p.offset) {
-                       bch_err(c, "snapshot parent %u missing pointer to child %llu",
-                               id, k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-
-       for (i = 0; i < 2 && s.children[i]; i++) {
-               id = le32_to_cpu(s.children[i]);
-
-               ret = bch2_snapshot_lookup(trans, id, &v);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot node %llu has nonexistent child %u",
-                               k.k->p.offset, id);
-               if (ret)
-                       goto err;
-
-               if (le32_to_cpu(v.parent) != k.k->p.offset) {
-                       bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
-                               id, le32_to_cpu(v.parent), k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-
-       bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
-               !BCH_SNAPSHOT_WILL_DELETE(&s);
-
-       if (should_have_subvol) {
-               id = le32_to_cpu(s.subvol);
-               ret = bch2_subvolume_get(trans, id, false, &subvol);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-               if (ret)
-                       goto err;
-
-               if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
-                       bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
-                               k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       } else {
-               if (fsck_err_on(s.subvol,
-                               trans, snapshot_should_not_have_subvol,
-                               "snapshot should not point to subvol:\n%s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                       u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-                       ret = PTR_ERR_OR_ZERO(u);
-                       if (ret)
-                               goto err;
-
-                       u->v.subvol = 0;
-                       s = u->v;
-               }
-       }
-
-       ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
-       if (ret < 0)
-               goto err;
-
-       if (fsck_err_on(!ret,
-                       trans, snapshot_to_bad_snapshot_tree,
-                       "snapshot points to missing/incorrect tree:\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
-               if (ret)
-                       goto err;
-       }
-       ret = 0;
-
-       real_depth = bch2_snapshot_depth(c, parent_id);
-
-       if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
-                       trans, snapshot_bad_depth,
-                       "snapshot with incorrect depth field, should be %u:\n%s",
-                       real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.depth = cpu_to_le32(real_depth);
-               s = u->v;
-       }
-
-       ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
-       if (ret < 0)
-               goto err;
-
-       if (fsck_err_on(!ret,
-                       trans, snapshot_bad_skiplist,
-                       "snapshot with bad skiplist field:\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
-                       u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
-
-               bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
-               s = u->v;
-       }
-       ret = 0;
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
-       /*
-        * We iterate backwards as checking/fixing the depth field requires that
-        * the parent's depth already be correct:
-        */
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_reverse_commit(trans, iter,
-                               BTREE_ID_snapshots, POS_MAX,
-                               BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_snapshot(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_snapshot_exists(struct btree_trans *trans, u32 id)
-{
-       struct bch_fs *c = trans->c;
-
-       /* Do we need to reconstruct the snapshot_tree entry as well? */
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-       u32 tree_id = 0;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
-                                    0, k, ret) {
-               if (k.k->type == KEY_TYPE_snapshot_tree &&
-                   le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
-                       tree_id = k.k->p.offset;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               return ret;
-
-       if (!tree_id) {
-               ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
-               if (ret)
-                       return ret;
-       }
-
-       struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
-       ret = PTR_ERR_OR_ZERO(snapshot);
-       if (ret)
-               return ret;
-
-       bkey_snapshot_init(&snapshot->k_i);
-       snapshot->k.p           = POS(0, id);
-       snapshot->v.tree        = cpu_to_le32(tree_id);
-       snapshot->v.btime.lo    = cpu_to_le64(bch2_current_time(c));
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
-                                    0, k, ret) {
-               if (k.k->type == KEY_TYPE_subvolume &&
-                   le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
-                       snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
-                       SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return  bch2_snapshot_table_make_room(c, id) ?:
-               bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
-}
-
-/*
- * Figure out which snapshot nodes belong in the same tree: all versions of the
- * same key (i.e. keys at the same position) must be from snapshots in the same
- * tree, so snapshot IDs seen at a given position are grouped together, and
- * groups that share an ID are merged:
- */
-struct snapshot_tree_reconstruct {
-       enum btree_id                   btree;
-       struct bpos                     cur_pos;
-       snapshot_id_list                cur_ids;
-       DARRAY(snapshot_id_list)        trees;
-};
-
-static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
-{
-       darray_for_each(r->trees, i)
-               darray_exit(i);
-       darray_exit(&r->trees);
-       darray_exit(&r->cur_ids);
-}
-
-static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
-       return r->btree == BTREE_ID_inodes
-               ? r->cur_pos.offset == pos.offset
-               : r->cur_pos.inode == pos.inode;
-}
-
-static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
-{
-       return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
-}
-
-static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
-{
-       bool first = true;
-       darray_for_each(*s, i) {
-               if (!first)
-                       prt_char(out, ' ');
-               first = false;
-               prt_printf(out, "%u", *i);
-       }
-}
-
-static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
-{
-       if (r->cur_ids.nr) {
-               darray_for_each(r->trees, i)
-                       if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
-                               int ret = snapshot_list_merge(c, i, &r->cur_ids);
-                               if (ret)
-                                       return ret;
-                               goto out;
-                       }
-               darray_push(&r->trees, r->cur_ids);
-               darray_init(&r->cur_ids);
-       }
-out:
-       r->cur_ids.nr = 0;
-       return 0;
-}
-
-static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
-       if (!same_snapshot(r, pos))
-               snapshot_tree_reconstruct_next(c, r);
-       r->cur_pos = pos;
-       return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
-}
-
-int bch2_reconstruct_snapshots(struct bch_fs *c)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct printbuf buf = PRINTBUF;
-       struct snapshot_tree_reconstruct r = {};
-       int ret = 0;
-
-       for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
-               if (btree_type_has_snapshots(btree)) {
-                       r.btree = btree;
-
-                       ret = for_each_btree_key(trans, iter, btree, POS_MIN,
-                                       BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
-                               get_snapshot_trees(c, &r, k.k->p);
-                       }));
-                       if (ret)
-                               goto err;
-
-                       snapshot_tree_reconstruct_next(c, &r);
-               }
-       }
-
-       darray_for_each(r.trees, t) {
-               printbuf_reset(&buf);
-               snapshot_id_list_to_text(&buf, t);
-
-               darray_for_each(*t, id) {
-                       if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
-                                       trans, snapshot_node_missing,
-                                       "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
-                               if (t->nr > 1) {
-                                       bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
-                                       ret = bch_err_throw(c, fsck_repair_unimplemented);
-                                       goto err;
-                               }
-
-                               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                                               check_snapshot_exists(trans, *id));
-                               if (ret)
-                                       goto err;
-                       }
-               }
-       }
-fsck_err:
-err:
-       bch2_trans_put(trans);
-       snapshot_tree_reconstruct_exit(&r);
-       printbuf_exit(&buf);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-int __bch2_check_key_has_snapshot(struct btree_trans *trans,
-                                 struct btree_iter *iter,
-                                 struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       int ret = 0;
-       enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
-
-       /* Snapshot was definitively deleted, this error is marked autofix */
-       if (fsck_err_on(state == SNAPSHOT_ID_deleted,
-                       trans, bkey_in_deleted_snapshot,
-                       "key in deleted snapshot %s, delete?",
-                       (bch2_btree_id_to_text(&buf, iter->btree_id),
-                        prt_char(&buf, ' '),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-               ret = bch2_btree_delete_at(trans, iter,
-                                          BTREE_UPDATE_internal_snapshot_node) ?: 1;
-
-       if (state == SNAPSHOT_ID_empty) {
-               /*
-                * Snapshot missing: we should have caught this with btree_lost_data and
-                * kicked off reconstruct_snapshots, so if we end up here we have no
-                * idea what happened.
-                *
-                * Do not delete unless we know that subvolumes and snapshots
-                * are consistent:
-                *
-                * XXX:
-                *
-                * We could be smarter here, and instead of using the generic
-                * recovery pass ratelimiting, track if there have been any
-                * changes to the snapshots or inodes btrees since those passes
-                * last ran.
-                */
-               ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
-               ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
-
-               if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
-                       ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
-
-               unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
-
-               if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
-                            "key in missing snapshot %s, delete?",
-                            (bch2_btree_id_to_text(&buf, iter->btree_id),
-                             prt_char(&buf, ' '),
-                             bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                       ret = bch2_btree_delete_at(trans, iter,
-                                                  BTREE_UPDATE_internal_snapshot_node) ?: 1;
-               }
-       }
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
-                                  enum btree_id btree, struct bpos pos,
-                                  snapshot_id_list *s)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
-                                            BTREE_ITER_all_snapshots, k, ret) {
-               if (!bkey_eq(k.k->p, pos))
-                       break;
-
-               if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
-                   snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
-                       continue;
-
-               ret = snapshot_list_add(c, s, k.k->p.snapshot);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-       if (ret)
-               darray_exit(s);
-
-       return ret;
-}
-
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
-       struct btree_iter iter;
-       struct bkey_i_snapshot *s =
-               bch2_bkey_get_mut_typed(trans, &iter,
-                                   BTREE_ID_snapshots, POS(0, id),
-                                   0, snapshot);
-       int ret = PTR_ERR_OR_ZERO(s);
-       if (unlikely(ret)) {
-               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
-                                       trans->c, "missing snapshot %u", id);
-               return ret;
-       }
-
-       /* already deleted? */
-       if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
-               goto err;
-
-       SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
-       SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
-       s->v.subvol = 0;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
-{
-       if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
-               swap(s->children[0], s->children[1]);
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter, p_iter = {};
-       struct btree_iter c_iter = {};
-       struct btree_iter tree_iter = {};
-       u32 parent_id, child_id;
-       unsigned i;
-       int ret = 0;
-
-       struct bkey_i_snapshot *s =
-               bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
-                                       BTREE_ITER_intent, snapshot);
-       ret = PTR_ERR_OR_ZERO(s);
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                               "missing snapshot %u", id);
-
-       if (ret)
-               goto err;
-
-       BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
-       BUG_ON(s->v.children[1]);
-
-       parent_id = le32_to_cpu(s->v.parent);
-       child_id = le32_to_cpu(s->v.children[0]);
-
-       if (parent_id) {
-               struct bkey_i_snapshot *parent;
-
-               parent = bch2_bkey_get_mut_typed(trans, &p_iter,
-                                    BTREE_ID_snapshots, POS(0, parent_id),
-                                    0, snapshot);
-               ret = PTR_ERR_OR_ZERO(parent);
-               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                                       "missing snapshot %u", parent_id);
-               if (unlikely(ret))
-                       goto err;
-
-               /* find entry in parent->children for node being deleted */
-               for (i = 0; i < 2; i++)
-                       if (le32_to_cpu(parent->v.children[i]) == id)
-                               break;
-
-               if (bch2_fs_inconsistent_on(i == 2, c,
-                                       "snapshot %u missing child pointer to %u",
-                                       parent_id, id))
-                       goto err;
-
-               parent->v.children[i] = cpu_to_le32(child_id);
-
-               normalize_snapshot_child_pointers(&parent->v);
-       }
-
-       if (child_id) {
-               struct bkey_i_snapshot *child;
-
-               child = bch2_bkey_get_mut_typed(trans, &c_iter,
-                                    BTREE_ID_snapshots, POS(0, child_id),
-                                    0, snapshot);
-               ret = PTR_ERR_OR_ZERO(child);
-               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                                       "missing snapshot %u", child_id);
-               if (unlikely(ret))
-                       goto err;
-
-               child->v.parent = cpu_to_le32(parent_id);
-
-               if (!child->v.parent) {
-                       child->v.skip[0] = 0;
-                       child->v.skip[1] = 0;
-                       child->v.skip[2] = 0;
-               }
-       }
-
-       if (!parent_id) {
-               /*
-                * We're deleting the root of a snapshot tree: update the
-                * snapshot_tree entry to point to the new root, or delete it if
-                * this is the last snapshot ID in this tree:
-                */
-               struct bkey_i_snapshot_tree *s_t;
-
-               BUG_ON(s->v.children[1]);
-
-               s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
-                               BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
-                               0, snapshot_tree);
-               ret = PTR_ERR_OR_ZERO(s_t);
-               if (ret)
-                       goto err;
-
-               if (s->v.children[0]) {
-                       s_t->v.root_snapshot = s->v.children[0];
-               } else {
-                       s_t->k.type = KEY_TYPE_deleted;
-                       set_bkey_val_u64s(&s_t->k, 0);
-               }
-       }
-
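-       /* bch2_request_incompat_feature() returns 0 when the incompat feature may be used */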
-       if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
-               SET_BCH_SNAPSHOT_DELETED(&s->v, true);
-               s->v.parent             = 0;
-               s->v.children[0]        = 0;
-               s->v.children[1]        = 0;
-               s->v.subvol             = 0;
-               s->v.tree               = 0;
-               s->v.depth              = 0;
-               s->v.skip[0]            = 0;
-               s->v.skip[1]            = 0;
-               s->v.skip[2]            = 0;
-       } else {
-               s->k.type = KEY_TYPE_deleted;
-               set_bkey_val_u64s(&s->k, 0);
-       }
-err:
-       bch2_trans_iter_exit(trans, &tree_iter);
-       bch2_trans_iter_exit(trans, &p_iter);
-       bch2_trans_iter_exit(trans, &c_iter);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
-                         u32 *new_snapids,
-                         u32 *snapshot_subvols,
-                         unsigned nr_snapids)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_i_snapshot *n;
-       struct bkey_s_c k;
-       unsigned i, j;
-       u32 depth = bch2_snapshot_depth(c, parent);
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
-                            POS_MIN, BTREE_ITER_intent);
-       k = bch2_btree_iter_peek(trans, &iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       for (i = 0; i < nr_snapids; i++) {
-               k = bch2_btree_iter_prev_slot(trans, &iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               if (!k.k || !k.k->p.offset) {
-                       ret = bch_err_throw(c, ENOSPC_snapshot_create);
-                       goto err;
-               }
-
-               n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(n);
-               if (ret)
-                       goto err;
-
-               n->v.flags      = 0;
-               n->v.parent     = cpu_to_le32(parent);
-               n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
-               n->v.tree       = cpu_to_le32(tree);
-               n->v.depth      = cpu_to_le32(depth);
-               n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
-               n->v.btime.hi   = 0;
-
-               for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
-                       n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
-
-               bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
-               SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
-               ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
-                                        bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
-               if (ret)
-                       goto err;
-
-               new_snapids[i]  = iter.pos.offset;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       struct btree_iter iter;
-       struct bkey_i_snapshot *n_parent;
-       int ret = 0;
-
-       n_parent = bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_snapshots, POS(0, parent),
-                       0, snapshot);
-       ret = PTR_ERR_OR_ZERO(n_parent);
-       if (unlikely(ret)) {
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(trans->c, "snapshot %u not found", parent);
-               return ret;
-       }
-
-       if (n_parent->v.children[0] || n_parent->v.children[1]) {
-               bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
-                            new_snapids, snapshot_subvols, nr_snapids);
-       if (ret)
-               goto err;
-
-       n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
-       n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
-       n_parent->v.subvol = 0;
-       SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       struct bkey_i_snapshot_tree *n_tree;
-       int ret;
-
-       n_tree = __bch2_snapshot_tree_create(trans);
-       ret =   PTR_ERR_OR_ZERO(n_tree) ?:
-               create_snapids(trans, 0, n_tree->k.p.offset,
-                            new_snapids, snapshot_subvols, nr_snapids);
-       if (ret)
-               return ret;
-
-       n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
-       n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
-       return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       BUG_ON((parent == 0) != (nr_snapids == 1));
-       BUG_ON((parent != 0) != (nr_snapids == 2));
-
-       return parent
-               ? bch2_snapshot_node_create_children(trans, parent,
-                               new_snapids, snapshot_subvols, nr_snapids)
-               : bch2_snapshot_node_create_tree(trans,
-                               new_snapids, snapshot_subvols, nr_snapids);
-}
-
-/*
- * If we have an unlinked inode in an internal snapshot node, and the inode
- * really has been deleted in all child snapshots, how does this get cleaned up?
- *
- * First there is the problem of how keys that have been overwritten in all
- * child snapshots get deleted (unimplemented?), but inodes may perhaps be
- * special.
- *
- * Also: an unlinked inode in an internal snapshot node appears not to get
- * deleted correctly if the inode doesn't exist in any leaf snapshot.
- *
- * Solution:
- *
- * For a key in an interior snapshot node that needs work requiring it to be
- * mutated: iterate over all descendant leaf nodes and copy that key to the
- * leaf snapshots, where it can be mutated.
- */
-
-static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
-{
-       struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
-       return i ? i->live_child : 0;
-}
-
-static unsigned __live_child(struct snapshot_table *t, u32 id,
-                            snapshot_id_list *delete_leaves,
-                            interior_delete_list *delete_interior)
-{
-       struct snapshot_t *s = __snapshot_t(t, id);
-       if (!s)
-               return 0;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
-               if (s->children[i] &&
-                   !snapshot_list_has_id(delete_leaves, s->children[i]) &&
-                   !interior_delete_has_id(delete_interior, s->children[i]))
-                       return s->children[i];
-
-       for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
-               u32 live_child = s->children[i]
-                       ? __live_child(t, s->children[i], delete_leaves, delete_interior)
-                       : 0;
-               if (live_child)
-                       return live_child;
-       }
-
-       return 0;
-}
-
-static unsigned live_child(struct bch_fs *c, u32 id)
-{
-       struct snapshot_delete *d = &c->snapshot_delete;
-
-       guard(rcu)();
-       return __live_child(rcu_dereference(c->snapshots), id,
-                           &d->delete_leaves, &d->delete_interior);
-}
-
-static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
-{
-       return snapshot_list_has_id(&d->delete_leaves, id) ||
-               interior_delete_has_id(&d->delete_interior, id) != 0;
-}
-
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
-                                            struct btree_iter *iter,
-                                            struct bkey_s_c k)
-{
-       struct snapshot_delete *d = &trans->c->snapshot_delete;
-
-       if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
-               return bch2_btree_delete_at(trans, iter,
-                                           BTREE_UPDATE_internal_snapshot_node);
-
-       u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
-       if (live_child) {
-               struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-               int ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       return ret;
-
-               new->k.p.snapshot = live_child;
-
-               struct btree_iter dst_iter;
-               struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
-                                                          iter->btree_id, new->k.p,
-                                                          BTREE_ITER_all_snapshots|
-                                                          BTREE_ITER_intent);
-               ret = bkey_err(dst_k);
-               if (ret)
-                       return ret;
-
-               ret =   (bkey_deleted(dst_k.k)
-                        ? bch2_trans_update(trans, &dst_iter, new,
-                                            BTREE_UPDATE_internal_snapshot_node)
-                        : 0) ?:
-                       bch2_btree_delete_at(trans, iter,
-                                            BTREE_UPDATE_internal_snapshot_node);
-               bch2_trans_iter_exit(trans, &dst_iter);
-               return ret;
-       }
-
-       return 0;
-}
-
-static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_delete *d = &c->snapshot_delete;
-
-       u64 inum = iter->btree_id != BTREE_ID_inodes
-               ? iter->pos.inode
-               : iter->pos.offset;
-
-       if (*prev_inum == inum)
-               return false;
-
-       *prev_inum = inum;
-
-       bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
-                                        bch2_snapshot_tree(c, iter->pos.snapshot));
-       if (unlikely(ret)) {
-               struct bpos pos = iter->pos;
-               pos.snapshot = 0;
-               if (iter->btree_id != BTREE_ID_inodes)
-                       pos.offset = U64_MAX;
-               bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
-       }
-
-       return ret;
-}
-
-static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_delete *d = &c->snapshot_delete;
-
-       for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
-               struct disk_reservation res = { 0 };
-               u64 prev_inum = 0;
-
-               d->pos.pos = POS_MIN;
-
-               if (!btree_type_has_snapshots(d->pos.btree))
-                       continue;
-
-               int ret = for_each_btree_key_commit(trans, iter,
-                               d->pos.btree, POS_MIN,
-                               BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                               &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-                       d->pos.pos = iter.pos;
-
-                       if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
-                               continue;
-
-                       delete_dead_snapshots_process_key(trans, &iter, k);
-               }));
-
-               bch2_disk_reservation_put(c, &res);
-
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
-                                          struct bpos start, struct bpos end)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_delete *d = &c->snapshot_delete;
-       struct disk_reservation res = { 0 };
-
-       d->pos.btree    = btree;
-       d->pos.pos      = POS_MIN;
-
-       int ret = for_each_btree_key_max_commit(trans, iter,
-                       btree, start, end,
-                       BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                       &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-               d->pos.pos = iter.pos;
-               delete_dead_snapshots_process_key(trans, &iter, k);
-       }));
-
-       bch2_disk_reservation_put(c, &res);
-       return ret;
-}
-
-static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_delete *d = &c->snapshot_delete;
-       struct disk_reservation res = { 0 };
-       u64 prev_inum = 0;
-       int ret = 0;
-
-       struct btree_iter iter;
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
-                            BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-
-       while (1) {
-               struct bkey_s_c k;
-               ret = lockrestart_do(trans,
-                               bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
-               if (ret)
-                       break;
-
-               if (!k.k)
-                       break;
-
-               d->pos.btree    = iter.btree_id;
-               d->pos.pos      = iter.pos;
-
-               if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
-                       continue;
-
-               if (snapshot_id_dying(d, k.k->p.snapshot)) {
-                       struct bpos start       = POS(k.k->p.offset, 0);
-                       struct bpos end         = POS(k.k->p.offset, U64_MAX);
-
-                       ret   = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
-                               delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
-                               delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
-                       if (ret)
-                               break;
-
-                       bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
-               } else {
-                       bch2_btree_iter_advance(trans, &iter);
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               goto err;
-
-       prev_inum = 0;
-       ret = for_each_btree_key_commit(trans, iter,
-                       BTREE_ID_inodes, POS_MIN,
-                       BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-                       &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-               d->pos.btree    = iter.btree_id;
-               d->pos.pos      = iter.pos;
-
-               if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
-                       continue;
-
-               delete_dead_snapshots_process_key(trans, &iter, k);
-       }));
-err:
-       bch2_disk_reservation_put(c, &res);
-       return ret;
-}
-
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       struct bch_fs *c = trans->c;
-       struct snapshot_delete *d = &c->snapshot_delete;
-       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-       unsigned live_children = 0;
-       int ret = 0;
-
-       if (BCH_SNAPSHOT_SUBVOL(s.v))
-               return 0;
-
-       if (BCH_SNAPSHOT_DELETED(s.v))
-               return 0;
-
-       mutex_lock(&d->progress_lock);
-       for (unsigned i = 0; i < 2; i++) {
-               u32 child = le32_to_cpu(s.v->children[i]);
-
-               live_children += child &&
-                       !snapshot_list_has_id(&d->delete_leaves, child);
-       }
-
-       u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
-
-       if (live_children == 0) {
-               ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
-                       snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
-       } else if (live_children == 1) {
-               struct snapshot_interior_delete n = {
-                       .id             = s.k->p.offset,
-                       .live_child     = live_child(c, s.k->p.offset),
-               };
-
-               if (!n.live_child) {
-                       bch_err(c, "error finding live child of snapshot %u", n.id);
-                       ret = -EINVAL;
-               } else {
-                       ret =   snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
-                               darray_push(&d->delete_interior, n);
-               }
-       }
-       mutex_unlock(&d->progress_lock);
-
-       return ret;
-}
-
-static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
-                                               interior_delete_list *skip)
-{
-       guard(rcu)();
-       while (interior_delete_has_id(skip, id))
-               id = __bch2_snapshot_parent(c, id);
-
-       while (n--) {
-               do {
-                       id = __bch2_snapshot_parent(c, id);
-               } while (interior_delete_has_id(skip, id));
-       }
-
-       return id;
-}
-
-static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
-                                             struct btree_iter *iter, struct bkey_s_c k,
-                                             interior_delete_list *deleted)
-{
-       struct bch_fs *c = trans->c;
-       u32 nr_deleted_ancestors = 0;
-       struct bkey_i_snapshot *s;
-       int ret;
-
-       if (!bch2_snapshot_exists(c, k.k->p.offset))
-               return 0;
-
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       if (interior_delete_has_id(deleted, k.k->p.offset))
-               return 0;
-
-       s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
-       ret = PTR_ERR_OR_ZERO(s);
-       if (ret)
-               return ret;
-
-       darray_for_each(*deleted, i)
-               nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
-
-       if (!nr_deleted_ancestors)
-               return 0;
-
-       le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
-
-       if (!s->v.depth) {
-               s->v.skip[0] = 0;
-               s->v.skip[1] = 0;
-               s->v.skip[2] = 0;
-       } else {
-               u32 depth = le32_to_cpu(s->v.depth);
-               u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
-
-               for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
-                       u32 id = le32_to_cpu(s->v.skip[j]);
-
-                       if (interior_delete_has_id(deleted, id)) {
-                               id = bch2_snapshot_nth_parent_skip(c,
-                                                       parent,
-                                                       depth > 1
-                                                       ? get_random_u32_below(depth - 1)
-                                                       : 0,
-                                                       deleted);
-                               s->v.skip[j] = cpu_to_le32(id);
-                       }
-               }
-
-               bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
-       }
-
-       return bch2_trans_update(trans, iter, &s->k_i, 0);
-}
-
-static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
-{
-       prt_printf(out, "deleting from trees");
-       darray_for_each(d->deleting_from_trees, i)
-               prt_printf(out, " %u", *i);
-
-       prt_printf(out, "deleting leaves");
-       darray_for_each(d->delete_leaves, i)
-               prt_printf(out, " %u", *i);
-       prt_newline(out);
-
-       prt_printf(out, "interior");
-       darray_for_each(d->delete_interior, i)
-               prt_printf(out, " %u->%u", i->id, i->live_child);
-       prt_newline(out);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *c)
-{
-       struct snapshot_delete *d = &c->snapshot_delete;
-       int ret = 0;
-
-       if (!mutex_trylock(&d->lock))
-               return 0;
-
-       if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
-               goto out_unlock;
-
-       struct btree_trans *trans = bch2_trans_get(c);
-
-       /*
-        * For every snapshot node: If we have no live children and it's not
-        * pointed to by a subvolume, delete it:
-        */
-       d->running = true;
-       d->pos = BBPOS_MIN;
-
-       ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
-               check_should_delete_snapshot(trans, k));
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_msg(c, ret, "walking snapshots");
-       if (ret)
-               goto err;
-
-       if (!d->delete_leaves.nr && !d->delete_interior.nr)
-               goto err;
-
-       {
-               struct printbuf buf = PRINTBUF;
-               bch2_snapshot_delete_nodes_to_text(&buf, d);
-
-               ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
-               printbuf_exit(&buf);
-               if (ret)
-                       goto err;
-       }
-
-       ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
-               ? delete_dead_snapshot_keys_v2(trans)
-               : delete_dead_snapshot_keys_v1(trans);
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_msg(c, ret, "deleting keys from dying snapshots");
-       if (ret)
-               goto err;
-
-       darray_for_each(d->delete_leaves, i) {
-               ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(trans, *i));
-               if (!bch2_err_matches(ret, EROFS))
-                       bch_err_msg(c, ret, "deleting snapshot %u", *i);
-               if (ret)
-                       goto err;
-       }
-
-       /*
-        * Fixing children of deleted snapshots can't be done completely
-        * atomically, if we crash between here and when we delete the interior
-        * nodes some depth fields will be off:
-        */
-       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
-                                 BTREE_ITER_intent, k,
-                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-               bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
-       if (ret)
-               goto err;
-
-       darray_for_each(d->delete_interior, i) {
-               ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(trans, i->id));
-               if (!bch2_err_matches(ret, EROFS))
-                       bch_err_msg(c, ret, "deleting snapshot %u", i->id);
-               if (ret)
-                       goto err;
-       }
-err:
-       mutex_lock(&d->progress_lock);
-       darray_exit(&d->deleting_from_trees);
-       darray_exit(&d->delete_interior);
-       darray_exit(&d->delete_leaves);
-       d->running = false;
-       mutex_unlock(&d->progress_lock);
-       bch2_trans_put(trans);
-
-       bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
-out_unlock:
-       mutex_unlock(&d->lock);
-       if (!bch2_err_matches(ret, EROFS))
-               bch_err_fn(c, ret);
-       return ret;
-}
-
-int bch2_delete_dead_snapshots(struct bch_fs *c)
-{
-       if (!c->opts.auto_snapshot_deletion)
-               return 0;
-
-       return __bch2_delete_dead_snapshots(c);
-}
-
-void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
-
-       set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
-
-       bch2_delete_dead_snapshots(c);
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
-       if (!c->opts.auto_snapshot_deletion)
-               return;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
-               return;
-
-       BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
-       if (!queue_work(system_long_wq, &c->snapshot_delete.work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct snapshot_delete *d = &c->snapshot_delete;
-
-       if (!d->running) {
-               prt_str(out, "(not running)");
-               return;
-       }
-
-       mutex_lock(&d->progress_lock);
-       bch2_snapshot_delete_nodes_to_text(out, d);
-
-       bch2_bbpos_to_text(out, d->pos);
-       mutex_unlock(&d->progress_lock);
-}
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
-                                      enum btree_id id,
-                                      struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
-                                            BTREE_ITER_not_extents|
-                                            BTREE_ITER_all_snapshots,
-                                            k, ret) {
-               if (!bkey_eq(pos, k.k->p))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
-                       ret = 1;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
-{
-       /* If there's one child, it's redundant and keys will be moved to the child */
-       return !!snap.v->children[0] + !!snap.v->children[1] == 1;
-}
-
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
-{
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
-       if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
-           interior_snapshot_needs_delete(snap))
-               set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
-
-       return 0;
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
-       /*
-        * Initializing the is_ancestor bitmaps requires ancestors to already be
-        * initialized - so mark in reverse:
-        */
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
-                                  POS_MAX, 0, k,
-                       __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-                       bch2_check_snapshot_needs_deletion(trans, k)));
-       bch_err_fn(c, ret);
-
-       /*
-        * It's important that we check if we need to reconstruct snapshots
-        * before going RW, so we mark that pass as required in the superblock -
-        * otherwise, we could end up deleting keys with missing snapshot nodes
-        * instead
-        */
-       BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
-              test_bit(BCH_FS_may_go_rw, &c->flags));
-
-       return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
-       kvfree(rcu_dereference_protected(c->snapshots, true));
-}
-
-void bch2_fs_snapshots_init_early(struct bch_fs *c)
-{
-       INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
-       mutex_init(&c->snapshot_delete.lock);
-       mutex_init(&c->snapshot_delete.progress_lock);
-       mutex_init(&c->snapshots_unlinked_lock);
-}
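
The key-moving pass in delete_dead_snapshots_process_key() above reduces to a
single rule: a key in a dying leaf snapshot is deleted outright, while a key in
a dying interior node is rewritten into that node's single live child, unless
the child already has its own version of the key. A minimal standalone sketch
of that rule, with the btree machinery stripped away (all names below are
hypothetical, for illustration only):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct dying_leaf     { uint32_t id; };
struct dying_interior { uint32_t id, live_child; };

/* toy stand-ins for d->delete_leaves and d->delete_interior */
static const struct dying_leaf     leaves[]   = { { 7 } };
static const struct dying_interior interior[] = { { 9, 8 } };

static bool leaf_dying(uint32_t snap)
{
        for (size_t i = 0; i < sizeof(leaves) / sizeof(*leaves); i++)
                if (leaves[i].id == snap)
                        return true;
        return false;
}

/* mirrors interior_delete_has_id(): 0 means "not a dying interior node" */
static uint32_t interior_live_child(uint32_t snap)
{
        for (size_t i = 0; i < sizeof(interior) / sizeof(*interior); i++)
                if (interior[i].id == snap)
                        return interior[i].live_child;
        return 0;
}

int main(void)
{
        uint32_t snaps[] = { 7, 8, 9 };

        for (size_t i = 0; i < 3; i++) {
                uint32_t s = snaps[i], child;

                if (leaf_dying(s))
                        printf("snapshot %u: delete key\n", s);
                else if ((child = interior_live_child(s)))
                        /* the real code skips the move if the child
                         * already has its own version of the key */
                        printf("snapshot %u: move key to child %u\n", s, child);
                else
                        printf("snapshot %u: keep key\n", s);
        }
        return 0;
}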
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
deleted file mode 100644 (file)
index 6766bf6..0000000
+++ /dev/null
@@ -1,275 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_H
-#define _BCACHEFS_SNAPSHOT_H
-
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
-                               struct bkey_validate_context);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) {       \
-       .key_validate   = bch2_snapshot_tree_validate,          \
-       .val_to_text    = bch2_snapshot_tree_to_text,           \
-       .min_val_size   = 8,                                    \
-})
-
-struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
-                          struct bkey_validate_context);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
-                      struct bkey_s_c, struct bkey_s,
-                      enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) {            \
-       .key_validate   = bch2_snapshot_validate,               \
-       .val_to_text    = bch2_snapshot_to_text,                \
-       .trigger        = bch2_mark_snapshot,                   \
-       .min_val_size   = 24,                                   \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
-       u32 idx = U32_MAX - id;
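-       /* snapshot IDs are allocated from U32_MAX down, so index from the top of the table */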
-
-       return likely(t && idx < t->nr)
-               ? &t->s[idx]
-               : NULL;
-}
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
-       return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       const struct snapshot_t *s = snapshot_t(c, id);
-       return s ? s->tree : 0;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s = snapshot_t(c, id);
-       return s ? s->parent : 0;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       return __bch2_snapshot_parent_early(c, id);
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s = snapshot_t(c, id);
-       if (!s)
-               return 0;
-
-       u32 parent = s->parent;
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           parent &&
-           s->depth != snapshot_t(c, parent)->depth + 1)
-               panic("id %u depth=%u parent %u depth=%u\n",
-                     id, snapshot_t(c, id)->depth,
-                     parent, snapshot_t(c, parent)->depth);
-
-       return parent;
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       return __bch2_snapshot_parent(c, id);
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
-       guard(rcu)();
-       while (n--)
-               id = __bch2_snapshot_parent(c, id);
-       return id;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
-u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-
-       u32 parent;
-       while ((parent = __bch2_snapshot_parent(c, id)))
-               id = parent;
-       return id;
-}
-
-static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s = snapshot_t(c, id);
-       return s ? s->state : SNAPSHOT_ID_empty;
-}
-
-static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       return __bch2_snapshot_id_state(c, id);
-}
-
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
-{
-       return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
-}
-
-static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       const struct snapshot_t *s = snapshot_t(c, id);
-       return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
-}
-
-static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
-       int ret = bch2_snapshot_is_internal_node(c, id);
-       if (ret < 0)
-               return ret;
-       return !ret;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
-       guard(rcu)();
-       return parent ? snapshot_t(c, parent)->depth + 1 : 0;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-       return id == ancestor
-               ? true
-               : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
-       guard(rcu)();
-       const struct snapshot_t *t = snapshot_t(c, id);
-       return t && (t->children[0]|t->children[1]) != 0;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
-       return darray_find(*s, id) != NULL;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
-       darray_for_each(*s, i)
-               if (bch2_snapshot_is_ancestor(c, id, *i))
-                       return true;
-       return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
-       BUG_ON(snapshot_list_has_id(s, id));
-       int ret = darray_push(s, id);
-       if (ret)
-               bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
-       return ret;
-}
-
-static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
-       int ret = snapshot_list_has_id(s, id)
-               ? 0
-               : darray_push(s, id);
-       if (ret)
-               bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
-       return ret;
-}
-
-static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
-{
-       darray_for_each(*src, i) {
-               int ret = snapshot_list_add_nodup(c, dst, *i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
-                        struct bch_snapshot *s);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
-                            struct bch_subvolume *);
-
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
-                             u32 *, u32 *, unsigned);
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
-int bch2_reconstruct_snapshots(struct bch_fs *);
-
-int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
-
-static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
-                                             struct btree_iter *iter,
-                                             struct bkey_s_c k)
-{
-       return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
-               ? 0
-               : __bch2_check_key_has_snapshot(trans, iter, k);
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *,
-                                  enum btree_id, struct bpos,
-                                  snapshot_id_list *);
-
-/*
- * Get a list of snapshot IDs that have overwritten a given key:
- */
-static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
-                                              enum btree_id btree, struct bpos pos,
-                                              snapshot_id_list *s)
-{
-       darray_init(s);
-
-       return bch2_snapshot_has_children(trans->c, pos.snapshot)
-               ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
-               : 0;
-}
-
-int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
-                                         enum btree_id id,
-                                         struct bpos pos)
-{
-       if (!btree_type_has_snapshots(id) ||
-           bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
-               return 0;
-
-       return __bch2_key_has_snapshot_overwrites(trans, id, pos);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *);
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
-
-int bch2_snapshots_read(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-void bch2_fs_snapshots_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SNAPSHOT_H */
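
Most of the inline helpers above are straight pointer-chasing over the
in-memory snapshot table; the skip[3] entries and depth field exist so that
ancestor queries don't have to step one parent at a time. A self-contained toy
of the two simplest walks, mirroring bch2_snapshot_nth_parent() and
bch2_snapshot_root() (toy table with small IDs, not the bcachefs API):

#include <stdint.h>
#include <stdio.h>

/* Toy table: index i holds the parent of snapshot i (0 = no parent). */
static const uint32_t parent[] = { 0, 0, 1, 1, 2 }; /* 2,3 children of 1; 4 of 2 */

static uint32_t nth_parent(uint32_t id, uint32_t n)
{
        while (n--)
                id = parent[id];        /* 0 stays 0, as in the kernel helpers */
        return id;
}

static uint32_t root(uint32_t id)
{
        while (parent[id])
                id = parent[id];
        return id;
}

int main(void)
{
        printf("nth_parent(4, 2) = %u\n", nth_parent(4, 2)); /* 1 */
        printf("root(4)          = %u\n", root(4));          /* 1 */
        return 0;
}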
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
deleted file mode 100644 (file)
index 9bccae1..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
-#define _BCACHEFS_SNAPSHOT_FORMAT_H
-
-struct bch_snapshot {
-       struct bch_val          v;
-       __le32                  flags;
-       __le32                  parent;
-       __le32                  children[2];
-       __le32                  subvol;
-       /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
-       __le32                  tree;
-       __le32                  depth;
-       __le32                  skip[3];
-       bch_le128               btime;
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags,  0,  1)
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
-LE32_BITMASK(BCH_SNAPSHOT_DELETED,     struct bch_snapshot, flags,  2,  3)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
-       struct bch_val          v;
-       __le32                  master_subvol;
-       __le32                  root_snapshot;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
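
The three LE32_BITMASK() lines above generate getter/setter pairs over bit
ranges of the little-endian flags word. A standalone model of roughly what
such a generated pair does for the SUBVOL flag (bits 1..2), in plain C rather
than the kernel macro, and assuming a little-endian host for brevity (the real
accessors go through le32_to_cpu()/cpu_to_le32()):

#include <stdint.h>
#include <stdio.h>

struct toy_snapshot { uint32_t flags; };

/* getter: extract bits [1, 2) of flags */
static uint32_t TOY_SNAPSHOT_SUBVOL(const struct toy_snapshot *s)
{
        return (s->flags >> 1) & ~(~0U << (2 - 1));
}

/* setter: clear bits [1, 2), then store the new value there */
static void SET_TOY_SNAPSHOT_SUBVOL(struct toy_snapshot *s, uint32_t v)
{
        s->flags &= ~(~(~0U << (2 - 1)) << 1);
        s->flags |= (v & ~(~0U << (2 - 1))) << 1;
}

int main(void)
{
        struct toy_snapshot s = { 0 };

        SET_TOY_SNAPSHOT_SUBVOL(&s, 1);
        printf("subvol bit: %u, flags: 0x%x\n", TOY_SNAPSHOT_SUBVOL(&s), s.flags);
        return 0;
}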
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
deleted file mode 100644 (file)
index 0ab698f..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
-#define _BCACHEFS_SNAPSHOT_TYPES_H
-
-#include "bbpos_types.h"
-#include "darray.h"
-#include "subvolume_types.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP     128
-
-struct snapshot_t {
-       enum snapshot_id_state {
-               SNAPSHOT_ID_empty,
-               SNAPSHOT_ID_live,
-               SNAPSHOT_ID_deleted,
-       }                       state;
-       u32                     parent;
-       u32                     skip[3];
-       u32                     depth;
-       u32                     children[2];
-       u32                     subvol; /* Nonzero only if a subvolume points to this node: */
-       u32                     tree;
-       unsigned long           is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
-       struct rcu_head         rcu;
-       size_t                  nr;
-#ifndef RUST_BINDGEN
-       DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
-       struct snapshot_t       s[0];
-#endif
-};
-
-struct snapshot_interior_delete {
-       u32     id;
-       u32     live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
-struct snapshot_delete {
-       struct mutex            lock;
-       struct work_struct      work;
-
-       struct mutex            progress_lock;
-       snapshot_id_list        deleting_from_trees;
-       snapshot_id_list        delete_leaves;
-       interior_delete_list    delete_interior;
-
-       bool                    running;
-       struct bbpos            pos;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
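
snapshot_id_list above is a DARRAY(u32): a growable array with data/nr/size
fields behind the darray_push()/darray_find() helpers used throughout
snapshot.c. A minimal standalone equivalent, to make those helpers concrete
(the growth policy here is arbitrary, for illustration only):

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_id_list { uint32_t *data; size_t nr, size; };

static int toy_push(struct toy_id_list *l, uint32_t id)
{
        if (l->nr == l->size) {
                size_t n = l->size ? l->size * 2 : 8;
                uint32_t *d = realloc(l->data, n * sizeof(*d));
                if (!d)
                        return -1;      /* the kernel version reports ENOMEM */
                l->data = d;
                l->size = n;
        }
        l->data[l->nr++] = id;
        return 0;
}

static bool toy_has_id(const struct toy_id_list *l, uint32_t id)
{
        for (size_t i = 0; i < l->nr; i++)      /* darray_find() is this loop */
                if (l->data[i] == id)
                        return true;
        return false;
}

int main(void)
{
        struct toy_id_list l = { 0 };

        toy_push(&l, 42);
        printf("has 42: %d, has 7: %d\n", toy_has_id(&l, 42), toy_has_id(&l, 7));
        free(l.data);
        return 0;
}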
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
deleted file mode 100644 (file)
index 3e9f592..0000000
+++ /dev/null
@@ -1,400 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "fsck.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
-{
-       if (d.v->d_type == DT_SUBVOL) {
-               struct bch_subvolume subvol;
-               int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
-                                            false, &subvol);
-               if (ret && !bch2_err_matches(ret, ENOENT))
-                       return ret;
-               return !ret;
-       } else {
-               struct btree_iter iter;
-               struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                               SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
-               int ret = bkey_err(k);
-               if (ret)
-                       return ret;
-
-               ret = bkey_is_inode(k.k);
-               bch2_trans_iter_exit(trans, &iter);
-               return ret;
-       }
-}
-
-static int bch2_fsck_rename_dirent(struct btree_trans *trans,
-                                  struct snapshots_seen *s,
-                                  const struct bch_hash_desc desc,
-                                  struct bch_hash_info *hash_info,
-                                  struct bkey_s_c_dirent old,
-                                  bool *updated_before_k_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct qstr old_name = bch2_dirent_get_name(old);
-       struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
-       int ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               return ret;
-
-       bkey_dirent_init(&new->k_i);
-       dirent_copy_target(new, old);
-       new->k.p = old.k->p;
-
-       char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
-       ret = PTR_ERR_OR_ZERO(renamed_buf);
-       if (ret)
-               return ret;
-
-       for (unsigned i = 0; i < 1000; i++) {
-               new->k.u64s = BKEY_U64s_MAX;
-
-               struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
-                                       sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
-                                               old_name.len, old_name.name, i));
-
-               ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL);
-               if (ret)
-                       return ret;
-
-               ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-                                               (subvol_inum) { 0, old.k->p.inode },
-                                               old.k->p.snapshot, &new->k_i,
-                                               BTREE_UPDATE_internal_snapshot_node|
-                                               STR_HASH_must_create);
-               if (ret && !bch2_err_matches(ret, EEXIST))
-                       break;
-               if (!ret) {
-                       if (bpos_lt(new->k.p, old.k->p))
-                               *updated_before_k_pos = true;
-                       break;
-               }
-       }
-
-       ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static noinline int hash_pick_winner(struct btree_trans *trans,
-                                    const struct bch_hash_desc desc,
-                                    struct bch_hash_info *hash_info,
-                                    struct bkey_s_c k1,
-                                    struct bkey_s_c k2)
-{
-       if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
-           !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
-               return 0;
-
-       switch (desc.btree_id) {
-       case BTREE_ID_dirents: {
-               int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
-               if (ret < 0)
-                       return ret;
-               if (!ret)
-                       return 0;
-
-               ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
-               if (ret < 0)
-                       return ret;
-               if (!ret)
-                       return 1;
-               return 2;
-       }
-       default:
-               return 0;
-       }
-}
-
-/*
- * str_hash lookups across snapshots break in wild ways if hash_info in
- * different snapshot versions doesn't match - so if we find one mismatch, check
- * them all
- */
-int bch2_repair_inode_hash_info(struct btree_trans *trans,
-                               struct bch_inode_unpacked *snapshot_root)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct printbuf buf = PRINTBUF;
-       bool need_commit = false;
-       int ret = 0;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
-                                    POS(0, snapshot_root->bi_inum),
-                                    BTREE_ITER_all_snapshots, k, ret) {
-               if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
-                       break;
-               if (!bkey_is_inode(k.k))
-                       continue;
-
-               struct bch_inode_unpacked inode;
-               ret = bch2_inode_unpack(k, &inode);
-               if (ret)
-                       break;
-
-               if (inode.bi_hash_seed          == snapshot_root->bi_hash_seed &&
-                   INODE_STR_HASH(&inode)      == INODE_STR_HASH(snapshot_root)) {
-#ifdef CONFIG_BCACHEFS_DEBUG
-                       struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
-                       struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
-
-                       BUG_ON(hash1.type != hash2.type ||
-                              memcmp(&hash1.siphash_key,
-                                     &hash2.siphash_key,
-                                     sizeof(hash1.siphash_key)));
-#endif
-                       continue;
-               }
-
-               printbuf_reset(&buf);
-               prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
-                          snapshot_root->bi_inum,
-                          inode.bi_snapshot,
-                          snapshot_root->bi_snapshot);
-
-               bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
-               prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
-
-               bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
-               prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
-
-               if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
-                       inode.bi_hash_seed = snapshot_root->bi_hash_seed;
-                       SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
-
-                       ret = __bch2_fsck_write_inode(trans, &inode);
-                       if (ret)
-                               break;
-                       need_commit = true;
-               }
-       }
-
-       if (ret)
-               goto err;
-
-       if (!need_commit) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
-                          snapshot_root->bi_inum);
-
-               prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
-               bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
-               prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
-#if 0
-               prt_printf(&buf, "vs   snapshot %u ", hash_info->inum_snapshot);
-               bch2_prt_str_hash_type(&buf, hash_info->type);
-               prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
-#endif
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-               ret = bch_err_throw(c, fsck_repair_unimplemented);
-               goto err;
-       }
-
-       ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-               -BCH_ERR_transaction_restart_nested;
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * All versions of the same inode in different snapshots must have the same hash
- * seed/type: verify that the hash info we're using matches the root
- */
-static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
-                                                      struct bch_hash_info *hash_info)
-{
-       struct bch_inode_unpacked snapshot_root;
-       int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
-       if (ret)
-               return ret;
-
-       struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
-       if (hash_info->type != hash_root.type ||
-           memcmp(&hash_info->siphash_key,
-                  &hash_root.siphash_key,
-                  sizeof(hash_root.siphash_key)))
-               ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
-
-       return ret;
-}
-
-/* Put a str_hash key in its proper location, checking for duplicates */
-int bch2_str_hash_repair_key(struct btree_trans *trans,
-                            struct snapshots_seen *s,
-                            const struct bch_hash_desc *desc,
-                            struct bch_hash_info *hash_info,
-                            struct btree_iter *k_iter, struct bkey_s_c k,
-                            struct btree_iter *dup_iter, struct bkey_s_c dup_k,
-                            bool *updated_before_k_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct printbuf buf = PRINTBUF;
-       bool free_snapshots_seen = false;
-       int ret = 0;
-
-       if (!s) {
-               s = bch2_trans_kmalloc(trans, sizeof(*s));
-               ret = PTR_ERR_OR_ZERO(s);
-               if (ret)
-                       goto out;
-
-               s->pos = k_iter->pos;
-               darray_init(&s->ids);
-
-               ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
-               if (ret)
-                       goto out;
-
-               free_snapshots_seen = true;
-       }
-
-       if (!dup_k.k) {
-               struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
-               ret = PTR_ERR_OR_ZERO(new);
-               if (ret)
-                       goto out;
-
-               dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
-                                      (subvol_inum) { 0, new->k.p.inode },
-                                      new->k.p.snapshot, new,
-                                      STR_HASH_must_create|
-                                      BTREE_ITER_with_updates|
-                                      BTREE_UPDATE_internal_snapshot_node);
-               ret = bkey_err(dup_k);
-               if (ret)
-                       goto out;
-               if (dup_k.k)
-                       goto duplicate_entries;
-
-               if (bpos_lt(new->k.p, k.k->p))
-                       *updated_before_k_pos = true;
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
-                                                      k_iter->pos, new->k.p) ?:
-                       bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
-                                           BTREE_ITER_with_updates|
-                                           BTREE_UPDATE_internal_snapshot_node) ?:
-                       bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
-                       bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-                       -BCH_ERR_transaction_restart_commit;
-       } else {
-duplicate_entries:
-               ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
-               if (ret < 0)
-                       goto out;
-
-               if (!fsck_err(trans, hash_table_key_duplicate,
-                             "duplicate hash table keys%s:\n%s",
-                             ret != 2 ? "" : ", both point to valid inodes",
-                             (printbuf_reset(&buf),
-                              bch2_bkey_val_to_text(&buf, c, k),
-                              prt_newline(&buf),
-                              bch2_bkey_val_to_text(&buf, c, dup_k),
-                              buf.buf)))
-                       goto out;
-
-               switch (ret) {
-               case 0:
-                       ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
-                       break;
-               case 1:
-                       ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
-                       break;
-               case 2:
-                       ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
-                                                     bkey_s_c_to_dirent(k),
-                                                     updated_before_k_pos) ?:
-                               bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
-                                                   BTREE_ITER_with_updates);
-                       goto out;
-               }
-
-               ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
-                       -BCH_ERR_transaction_restart_commit;
-       }
-out:
-fsck_err:
-       bch2_trans_iter_exit(trans, dup_iter);
-       printbuf_exit(&buf);
-       if (free_snapshots_seen)
-               darray_exit(&s->ids);
-       return ret;
-}
-
-int __bch2_str_hash_check_key(struct btree_trans *trans,
-                             struct snapshots_seen *s,
-                             const struct bch_hash_desc *desc,
-                             struct bch_hash_info *hash_info,
-                             struct btree_iter *k_iter, struct bkey_s_c hash_k,
-                             bool *updated_before_k_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter = {};
-       struct printbuf buf = PRINTBUF;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       u64 hash = desc->hash_bkey(hash_info, hash_k);
-       if (hash_k.k->p.offset < hash)
-               goto bad_hash;
-
-       for_each_btree_key_norestart(trans, iter, desc->btree_id,
-                                    SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
-                                    BTREE_ITER_slots|
-                                    BTREE_ITER_with_updates, k, ret) {
-               if (bkey_eq(k.k->p, hash_k.k->p))
-                       break;
-
-               if (k.k->type == desc->key_type &&
-                   !desc->cmp_bkey(k, hash_k)) {
-                       ret =   check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
-                                                                  hash_info) ?:
-                               bch2_str_hash_repair_key(trans, s, desc, hash_info,
-                                                        k_iter, hash_k,
-                                                        &iter, k, updated_before_k_pos);
-                       break;
-               }
-
-               if (bkey_deleted(k.k))
-                       goto bad_hash;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-out:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-bad_hash:
-       bch2_trans_iter_exit(trans, &iter);
-       /*
-        * Before doing any repair, check hash_info itself:
-        */
-       ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
-       if (ret)
-               goto out;
-
-       if (fsck_err(trans, hash_table_key_wrong_offset,
-                    "hash table key at wrong offset: should be at %llu\n%s",
-                    hash,
-                    (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
-               ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
-                                              k_iter, hash_k,
-                                              &iter, bkey_s_c_null,
-                                              updated_before_k_pos);
-       goto out;
-}
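
The bad_hash test above encodes the basic open-addressing invariant: a
collision can only push a key forward from its ideal slot, never backward,
and every slot between the ideal position and the key itself must be
occupied. A minimal userspace sketch of that check, on a toy table with
illustrative names (not bcachefs code):

#include <stdio.h>

#define NR_SLOTS 16
/* table[i] == stored hash of the key at slot i, or 0 for a hole */
static unsigned table[NR_SLOTS];

/* Mirror of the verification above: a key claiming hash h and living at
 * slot pos must be reachable by probing forward from h without crossing
 * a hole. Returns 0 if plausible, -1 if the key is at the wrong offset. */
static int check_key(unsigned pos, unsigned h)
{
	if (pos < h)
		return -1;		/* bad_hash: stored before its ideal slot */
	for (unsigned i = h; i < pos; i++)
		if (!table[i])
			return -1;	/* hole before the key: lookups stop early */
	return 0;
}

int main(void)
{
	table[4] = 4;
	table[5] = 4;	/* collided key, pushed forward one slot */
	printf("%d %d\n", check_key(5, 4), check_key(7, 4));
	return 0;
}
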
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
deleted file mode 100644 (file)
index 8979ac2..0000000
+++ /dev/null
@@ -1,431 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_STR_HASH_H
-#define _BCACHEFS_STR_HASH_H
-
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "error.h"
-#include "inode.h"
-#include "siphash.h"
-#include "subvolume.h"
-#include "super.h"
-
-#include <linux/crc32c.h>
-#include <crypto/sha2.h>
-
-static inline enum bch_str_hash_type
-bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
-{
-       switch (opt) {
-       case BCH_STR_HASH_OPT_crc32c:
-               return BCH_STR_HASH_crc32c;
-       case BCH_STR_HASH_OPT_crc64:
-               return BCH_STR_HASH_crc64;
-       case BCH_STR_HASH_OPT_siphash:
-               return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
-                       ? BCH_STR_HASH_siphash
-                       : BCH_STR_HASH_siphash_old;
-       default:
-               BUG();
-       }
-}
-
-struct bch_hash_info {
-       u32                     inum_snapshot;
-       u8                      type;
-       struct unicode_map      *cf_encoding;
-       /*
-        * For crc32c or crc64 string hashes, the first word of the
-        * siphash_key (k0) is used as the hash seed.
-        */
-       SIPHASH_KEY     siphash_key;
-};
-
-static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
-       struct bch_hash_info info = {
-               .inum_snapshot  = bi->bi_snapshot,
-               .type           = INODE_STR_HASH(bi),
-               .cf_encoding    = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
-               .siphash_key    = { .k0 = bi->bi_hash_seed }
-       };
-
-       if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
-               u8 digest[SHA256_DIGEST_SIZE];
-
-               sha256((const u8 *)&bi->bi_hash_seed,
-                      sizeof(bi->bi_hash_seed), digest);
-               memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
-       }
-
-       return info;
-}
-
-struct bch_str_hash_ctx {
-       union {
-               u32             crc32c;
-               u64             crc64;
-               SIPHASH_CTX     siphash;
-       };
-};
-
-static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
-                                    const struct bch_hash_info *info)
-{
-       switch (info->type) {
-       case BCH_STR_HASH_crc32c:
-               ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
-                                    sizeof(info->siphash_key.k0));
-               break;
-       case BCH_STR_HASH_crc64:
-               ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
-                                     sizeof(info->siphash_key.k0));
-               break;
-       case BCH_STR_HASH_siphash_old:
-       case BCH_STR_HASH_siphash:
-               SipHash24_Init(&ctx->siphash, &info->siphash_key);
-               break;
-       default:
-               BUG();
-       }
-}
-
-static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
-                                      const struct bch_hash_info *info,
-                                      const void *data, size_t len)
-{
-       switch (info->type) {
-       case BCH_STR_HASH_crc32c:
-               ctx->crc32c = crc32c(ctx->crc32c, data, len);
-               break;
-       case BCH_STR_HASH_crc64:
-               ctx->crc64 = crc64_be(ctx->crc64, data, len);
-               break;
-       case BCH_STR_HASH_siphash_old:
-       case BCH_STR_HASH_siphash:
-               SipHash24_Update(&ctx->siphash, data, len);
-               break;
-       default:
-               BUG();
-       }
-}
-
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
-                                  const struct bch_hash_info *info)
-{
-       switch (info->type) {
-       case BCH_STR_HASH_crc32c:
-               return ctx->crc32c;
-       case BCH_STR_HASH_crc64:
-               return ctx->crc64 >> 1;
-       case BCH_STR_HASH_siphash_old:
-       case BCH_STR_HASH_siphash:
-               return SipHash24_End(&ctx->siphash) >> 1;
-       default:
-               BUG();
-       }
-}
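
The init/update/end triple above hides several hash backends behind one
context, so callers never branch on the hash type themselves. A minimal
userspace sketch of the same tagged-union dispatch pattern, with FNV-1a
standing in for crc32c/siphash (illustrative only, not bcachefs code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum toy_hash_type { TOY_HASH_fnv32, TOY_HASH_fnv64 };

struct toy_hash_ctx {
	enum toy_hash_type type;
	union {
		uint32_t h32;
		uint64_t h64;
	};
};

static void toy_hash_init(struct toy_hash_ctx *ctx, enum toy_hash_type type)
{
	ctx->type = type;
	if (type == TOY_HASH_fnv32)
		ctx->h32 = 2166136261u;			/* FNV-1a 32-bit offset basis */
	else
		ctx->h64 = 14695981039346656037ull;	/* FNV-1a 64-bit offset basis */
}

static void toy_hash_update(struct toy_hash_ctx *ctx, const void *data, size_t len)
{
	const uint8_t *p = data;

	for (size_t i = 0; i < len; i++) {
		if (ctx->type == TOY_HASH_fnv32)
			ctx->h32 = (ctx->h32 ^ p[i]) * 16777619u;
		else
			ctx->h64 = (ctx->h64 ^ p[i]) * 1099511628211ull;
	}
}

static uint64_t toy_hash_end(struct toy_hash_ctx *ctx)
{
	/* Like bch2_str_hash_end(), collapse every backend to one u64: */
	return ctx->type == TOY_HASH_fnv32 ? ctx->h32 : ctx->h64;
}

int main(void)
{
	struct toy_hash_ctx ctx;

	toy_hash_init(&ctx, TOY_HASH_fnv64);
	toy_hash_update(&ctx, "dirent-name", strlen("dirent-name"));
	printf("%llx\n", (unsigned long long)toy_hash_end(&ctx));
	return 0;
}
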
-
-struct bch_hash_desc {
-       enum btree_id   btree_id;
-       u8              key_type;
-
-       u64             (*hash_key)(const struct bch_hash_info *, const void *);
-       u64             (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
-       bool            (*cmp_key)(struct bkey_s_c, const void *);
-       bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
-       bool            (*is_visible)(subvol_inum inum, struct bkey_s_c);
-};
-
-static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
-{
-       return k.k->type == desc.key_type &&
-               (!desc.is_visible ||
-                !inum.inum ||
-                desc.is_visible(inum, k));
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
-                struct btree_iter *iter,
-                const struct bch_hash_desc desc,
-                const struct bch_hash_info *info,
-                subvol_inum inum, const void *key,
-                enum btree_iter_update_trigger_flags flags,
-                u32 snapshot)
-{
-       struct bkey_s_c k;
-       int ret;
-
-       for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
-                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
-                          POS(inum.inum, U64_MAX),
-                          BTREE_ITER_slots|flags, k, ret) {
-               if (is_visible_key(desc, inum, k)) {
-                       if (!desc.cmp_key(k, key))
-                               return k;
-               } else if (k.k->type == KEY_TYPE_hash_whiteout) {
-                       ;
-               } else {
-                       /* hole, not found */
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, iter);
-
-       return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
-}
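
On failure the lookup returns bkey_s_c_err(), which packs the errno into
the key pointer ERR_PTR-style so a single struct return covers both the
found-key and error cases, to be unpacked later via bkey_err(). A
simplified userspace analogue of that convention, with illustrative names:

#include <stdio.h>

struct toy_key { unsigned long long offset; };
struct toy_result { const struct toy_key *k; };

static inline struct toy_result toy_result_err(long err)
{
	/* Smuggle a negative errno through the pointer field. */
	return (struct toy_result){ .k = (const struct toy_key *)err };
}

static inline long toy_err(struct toy_result r)
{
	long v = (long)r.k;

	return v < 0 && v > -4096 ? v : 0;	/* IS_ERR-style range check */
}

int main(void)
{
	struct toy_result r = toy_result_err(-2);	/* -ENOENT */

	printf("err = %ld\n", toy_err(r));
	return 0;
}
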
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup(struct btree_trans *trans,
-                struct btree_iter *iter,
-                const struct bch_hash_desc desc,
-                const struct bch_hash_info *info,
-                subvol_inum inum, const void *key,
-                enum btree_iter_update_trigger_flags flags)
-{
-       u32 snapshot;
-       int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               return bkey_s_c_err(ret);
-
-       return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
-}
-
-static __always_inline int
-bch2_hash_hole(struct btree_trans *trans,
-              struct btree_iter *iter,
-              const struct bch_hash_desc desc,
-              const struct bch_hash_info *info,
-              subvol_inum inum, const void *key)
-{
-       struct bkey_s_c k;
-       u32 snapshot;
-       int ret;
-
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               return ret;
-
-       for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
-                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
-                          POS(inum.inum, U64_MAX),
-                          BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
-               if (!is_visible_key(desc, inum, k))
-                       return 0;
-       bch2_trans_iter_exit(trans, iter);
-
-       return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
-}
-
-static __always_inline
-int bch2_hash_needs_whiteout(struct btree_trans *trans,
-                            const struct bch_hash_desc desc,
-                            const struct bch_hash_info *info,
-                            struct btree_iter *start)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_copy_iter(trans, &iter, start);
-
-       bch2_btree_iter_advance(trans, &iter);
-
-       for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
-               if (k.k->type != desc.key_type &&
-                   k.k->type != KEY_TYPE_hash_whiteout)
-                       break;
-
-               if (k.k->type == desc.key_type &&
-                   desc.hash_bkey(info, k) <= start->pos.offset) {
-                       ret = 1;
-                       break;
-               }
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
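
bch2_hash_needs_whiteout() decides whether a delete may leave a plain hole
or must leave a whiteout: if any later entry in the probe chain hashes at
or before the slot being deleted, a hole there would cut that entry's
chain and make it unfindable. A standalone sketch of the rule on a toy
open-addressed table (illustrative names and sizes, not bcachefs code):

#include <stdbool.h>
#include <stdio.h>

#define TABLE_SIZE 8
enum slot_state { EMPTY, LIVE, WHITEOUT };

struct slot {
	enum slot_state state;
	unsigned	hash;	/* the key's ideal slot */
};

static struct slot table[TABLE_SIZE];

/* A deleted slot becomes a whiteout iff some later entry in the probe
 * chain hashes at or before it; otherwise a plain hole is safe. */
static bool needs_whiteout(unsigned pos)
{
	for (unsigned i = pos + 1; i < TABLE_SIZE; i++) {
		if (table[i].state == EMPTY)
			return false;	/* chain ends: plain delete is safe */
		if (table[i].state == LIVE && table[i].hash <= pos)
			return true;	/* entry probes through pos: keep chain intact */
	}
	return false;
}

static void delete_at(unsigned pos)
{
	table[pos].state = needs_whiteout(pos) ? WHITEOUT : EMPTY;
}

int main(void)
{
	table[2] = (struct slot){ LIVE, 2 };
	table[3] = (struct slot){ LIVE, 2 };	/* collided, probed forward */
	delete_at(2);
	printf("slot 2 is %s\n", table[2].state == WHITEOUT ? "a whiteout" : "empty");
	return 0;
}
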
-
-static __always_inline
-struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
-                          struct btree_iter *iter,
-                          const struct bch_hash_desc desc,
-                          const struct bch_hash_info *info,
-                          subvol_inum inum, u32 snapshot,
-                          struct bkey_i *insert,
-                          enum btree_iter_update_trigger_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter slot = {};
-       struct bkey_s_c k;
-       bool found = false;
-       int ret;
-
-       for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
-                          SPOS(insert->k.p.inode,
-                               desc.hash_bkey(info, bkey_i_to_s_c(insert)),
-                               snapshot),
-                          POS(insert->k.p.inode, U64_MAX),
-                          BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
-               if (is_visible_key(desc, inum, k)) {
-                       if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
-                               goto found;
-
-                       /* hash collision: */
-                       continue;
-               }
-
-               if (!slot.path && !(flags & STR_HASH_must_replace))
-                       bch2_trans_copy_iter(trans, &slot, iter);
-
-               if (k.k->type != KEY_TYPE_hash_whiteout)
-                       goto not_found;
-       }
-
-       if (!ret)
-               ret = bch_err_throw(c, ENOSPC_str_hash_create);
-out:
-       bch2_trans_iter_exit(trans, &slot);
-       bch2_trans_iter_exit(trans, iter);
-       return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-found:
-       found = true;
-not_found:
-       if (found && (flags & STR_HASH_must_create)) {
-               bch2_trans_iter_exit(trans, &slot);
-               return k;
-       } else if (!found && (flags & STR_HASH_must_replace)) {
-               ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
-       } else {
-               if (!found && slot.path)
-                       swap(*iter, slot);
-
-               insert->k.p = iter->pos;
-               ret = bch2_trans_update(trans, iter, insert, flags);
-       }
-
-       goto out;
-}
-
-static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
-                          const struct bch_hash_desc desc,
-                          const struct bch_hash_info *info,
-                          subvol_inum inum, u32 snapshot,
-                          struct bkey_i *insert,
-                          enum btree_iter_update_trigger_flags flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
-                                                            snapshot, insert, flags);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-       if (k.k) {
-               bch2_trans_iter_exit(trans, &iter);
-               return bch_err_throw(trans->c, EEXIST_str_hash_set);
-       }
-
-       return 0;
-}
-
-static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
-                 const struct bch_hash_desc desc,
-                 const struct bch_hash_info *info,
-                 subvol_inum inum,
-                 struct bkey_i *insert,
-                 enum btree_iter_update_trigger_flags flags)
-{
-       insert->k.p.inode = inum.inum;
-
-       u32 snapshot;
-       return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
-               bch2_hash_set_in_snapshot(trans, desc, info, inum,
-                                         snapshot, insert, flags);
-}
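
The error chains here and throughout (a ?: b) use the GCC/clang
conditional-with-omitted-middle-operand extension: the expression
evaluates to a when a is nonzero, otherwise to b, so the first nonzero
error code short-circuits everything after it. A minimal illustration
(assumes GCC or clang):

#include <stdio.h>

static int step1(void) { return 0; }	/* success: fall through to step2 */
static int step2(void) { return -5; }	/* first failure wins */
static int step3(void) { return -9; }	/* never evaluated */

int main(void)
{
	int ret = step1() ?: step2() ?: step3();

	printf("ret = %d\n", ret);	/* prints -5 */
	return 0;
}
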
-
-static __always_inline
-int bch2_hash_delete_at(struct btree_trans *trans,
-                       const struct bch_hash_desc desc,
-                       const struct bch_hash_info *info,
-                       struct btree_iter *iter,
-                       enum btree_iter_update_trigger_flags flags)
-{
-       struct bkey_i *delete;
-       int ret;
-
-       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
-       ret = PTR_ERR_OR_ZERO(delete);
-       if (ret)
-               return ret;
-
-       ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
-       if (ret < 0)
-               return ret;
-
-       bkey_init(&delete->k);
-       delete->k.p = iter->pos;
-       delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
-
-       return bch2_trans_update(trans, iter, delete, flags);
-}
-
-static __always_inline
-int bch2_hash_delete(struct btree_trans *trans,
-                    const struct bch_hash_desc desc,
-                    const struct bch_hash_info *info,
-                    subvol_inum inum, const void *key)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
-                                            BTREE_ITER_intent);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct snapshots_seen;
-int bch2_str_hash_repair_key(struct btree_trans *,
-                            struct snapshots_seen *,
-                            const struct bch_hash_desc *,
-                            struct bch_hash_info *,
-                            struct btree_iter *, struct bkey_s_c,
-                            struct btree_iter *, struct bkey_s_c,
-                            bool *);
-
-int __bch2_str_hash_check_key(struct btree_trans *,
-                             struct snapshots_seen *,
-                             const struct bch_hash_desc *,
-                             struct bch_hash_info *,
-                             struct btree_iter *, struct bkey_s_c,
-                             bool *);
-
-static inline int bch2_str_hash_check_key(struct btree_trans *trans,
-                           struct snapshots_seen *s,
-                           const struct bch_hash_desc *desc,
-                           struct bch_hash_info *hash_info,
-                           struct btree_iter *k_iter, struct bkey_s_c hash_k,
-                           bool *updated_before_k_pos)
-{
-       if (hash_k.k->type != desc->key_type)
-               return 0;
-
-       if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
-               return 0;
-
-       return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
-                                        updated_before_k_pos);
-}
-
-#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
deleted file mode 100644 (file)
index 0205874..0000000
+++ /dev/null
@@ -1,752 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-
-#include <linux/random.h>
-
-static int bch2_subvolume_delete(struct btree_trans *, u32);
-
-static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
-{
-       struct printbuf buf = PRINTBUF;
-       bch2_log_msg_start(c, &buf);
-
-       prt_printf(&buf, "missing subvolume %u", subvolid);
-       bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
-
-       int ret = bch2_run_explicit_recovery_pass(c, &buf,
-                                       BCH_RECOVERY_PASS_check_inodes, 0);
-       if (print)
-               bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static struct bpos subvolume_children_pos(struct bkey_s_c k)
-{
-       if (k.k->type != KEY_TYPE_subvolume)
-               return POS_MIN;
-
-       struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-       if (!s.v->fs_path_parent)
-               return POS_MIN;
-       return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
-}
-
-static int check_subvol(struct btree_trans *trans,
-                       struct btree_iter *iter,
-                       struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c_subvolume subvol;
-       struct btree_iter subvol_children_iter = {};
-       struct bch_snapshot snapshot;
-       struct printbuf buf = PRINTBUF;
-       unsigned snapid;
-       int ret = 0;
-
-       if (k.k->type != KEY_TYPE_subvolume)
-               return 0;
-
-       subvol = bkey_s_c_to_subvolume(k);
-       snapid = le32_to_cpu(subvol.v->snapshot);
-       ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
-
-       if (bch2_err_matches(ret, ENOENT))
-               return bch2_run_print_explicit_recovery_pass(c,
-                                       BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
-       if (ret)
-               return ret;
-
-       if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
-               ret = bch2_subvolume_delete(trans, iter->pos.offset);
-               bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
-               return ret ?: -BCH_ERR_transaction_restart_nested;
-       }
-
-       if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
-                       subvol.v->fs_path_parent,
-                       trans, subvol_root_fs_path_parent_nonzero,
-                       "root subvolume has nonzero fs_path_parent\n%s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               struct bkey_i_subvolume *n =
-                       bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
-               ret = PTR_ERR_OR_ZERO(n);
-               if (ret)
-                       goto err;
-
-               n->v.fs_path_parent = 0;
-       }
-
-       if (subvol.v->fs_path_parent) {
-               struct bpos pos = subvolume_children_pos(k);
-
-               struct bkey_s_c subvol_children_k =
-                       bch2_bkey_get_iter(trans, &subvol_children_iter,
-                                          BTREE_ID_subvolume_children, pos, 0);
-               ret = bkey_err(subvol_children_k);
-               if (ret)
-                       goto err;
-
-               if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
-                               trans, subvol_children_not_set,
-                               "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
-                               pos.inode, pos.offset,
-                               (printbuf_reset(&buf),
-                                bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                       ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       struct bch_inode_unpacked inode;
-       ret = bch2_inode_find_by_inum_nowarn_trans(trans,
-                                   (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
-                                   &inode);
-       if (!ret) {
-               if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
-                               trans, subvol_root_wrong_bi_subvol,
-                               "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
-                               inode.bi_inum, inode.bi_snapshot,
-                               inode.bi_subvol, subvol.k->p.offset)) {
-                       inode.bi_subvol = subvol.k->p.offset;
-                       inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
-                       ret = __bch2_fsck_write_inode(trans, &inode);
-                       if (ret)
-                               goto err;
-               }
-       } else if (bch2_err_matches(ret, ENOENT)) {
-               if (fsck_err(trans, subvol_to_missing_root,
-                            "subvolume %llu points to missing subvolume root %llu:%u",
-                            k.k->p.offset, le64_to_cpu(subvol.v->inode),
-                            le32_to_cpu(subvol.v->snapshot))) {
-                       /*
-                        * Recreate - any contents that are still disconnected
-                        * will then get reattached under lost+found
-                        */
-                       bch2_inode_init_early(c, &inode);
-                       bch2_inode_init_late(c, &inode, bch2_current_time(c),
-                                            0, 0, S_IFDIR|0700, 0, NULL);
-                       inode.bi_inum                   = le64_to_cpu(subvol.v->inode);
-                       inode.bi_snapshot               = le32_to_cpu(subvol.v->snapshot);
-                       inode.bi_subvol                 = k.k->p.offset;
-                       inode.bi_parent_subvol          = le32_to_cpu(subvol.v->fs_path_parent);
-                       ret = __bch2_fsck_write_inode(trans, &inode);
-                       if (ret)
-                               goto err;
-               }
-       } else {
-               goto err;
-       }
-
-       if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
-               u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
-               u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
-
-               struct bch_snapshot_tree st;
-               ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
-
-               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                               "%s: snapshot tree %u not found", __func__, snapshot_tree);
-
-               if (ret)
-                       goto err;
-
-               if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
-                               trans, subvol_not_master_and_not_snapshot,
-                               "subvolume %llu is not set as snapshot but is not master subvolume",
-                               k.k->p.offset)) {
-                       struct bkey_i_subvolume *s =
-                               bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
-                       ret = PTR_ERR_OR_ZERO(s);
-                       if (ret)
-                               goto err;
-
-                       SET_BCH_SUBVOLUME_SNAP(&s->v, true);
-               }
-       }
-err:
-fsck_err:
-       bch2_trans_iter_exit(trans, &subvol_children_iter);
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_subvols(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_subvol(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int check_subvol_child(struct btree_trans *trans,
-                             struct btree_iter *child_iter,
-                             struct bkey_s_c child_k)
-{
-       struct bch_subvolume s;
-       int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
-                                         0, subvolume, &s);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (fsck_err_on(ret ||
-                       le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
-                       trans, subvol_children_bad,
-                       "incorrect entry in subvolume_children btree %llu:%llu",
-                       child_k.k->p.inode, child_k.k->p.offset)) {
-               ret = bch2_btree_delete_at(trans, child_iter, 0);
-               if (ret)
-                       goto err;
-       }
-err:
-fsck_err:
-       return ret;
-}
-
-int bch2_check_subvol_children(struct bch_fs *c)
-{
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       check_subvol_child(trans, &iter, k)));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
-                           struct bkey_validate_context from)
-{
-       struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
-                        bkey_gt(k.k->p, SUBVOL_POS_MAX),
-                        c, subvol_pos_bad,
-                        "invalid pos");
-
-       bkey_fsck_err_on(!subvol.v->snapshot,
-                        c, subvol_snapshot_bad,
-                        "invalid snapshot");
-
-       bkey_fsck_err_on(!subvol.v->inode,
-                        c, subvol_inode_bad,
-                        "invalid inode");
-fsck_err:
-       return ret;
-}
-
-void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
-       prt_printf(out, "root %llu snapshot id %u",
-                  le64_to_cpu(s.v->inode),
-                  le32_to_cpu(s.v->snapshot));
-
-       if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
-               prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
-               prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
-       }
-
-       if (BCH_SUBVOLUME_RO(s.v))
-               prt_printf(out, " ro");
-       if (BCH_SUBVOLUME_SNAP(s.v))
-               prt_printf(out, " snapshot");
-       if (BCH_SUBVOLUME_UNLINKED(s.v))
-               prt_printf(out, " unlinked");
-}
-
-static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
-{
-       return !bpos_eq(pos, POS_MIN)
-               ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
-               : 0;
-}
-
-int bch2_subvolume_trigger(struct btree_trans *trans,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c old, struct bkey_s new,
-                          enum btree_iter_update_trigger_flags flags)
-{
-       if (flags & BTREE_TRIGGER_transactional) {
-               struct bpos children_pos_old = subvolume_children_pos(old);
-               struct bpos children_pos_new = subvolume_children_pos(new.s_c);
-
-               if (!bpos_eq(children_pos_old, children_pos_new)) {
-                       int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
-                                 subvolume_children_mod(trans, children_pos_new, true);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       return 0;
-}
-
-int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
-{
-       struct btree_iter iter;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
-       struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
-       bch2_trans_iter_exit(trans, &iter);
-
-       return bkey_err(k) ?: k.k && k.k->p.inode == subvol
-               ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
-               : 0;
-}
-
-static __always_inline int
-bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
-                          bool inconsistent_if_not_found,
-                          struct bch_subvolume *s)
-{
-       int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
-                                         BTREE_ITER_cached|
-                                         BTREE_ITER_with_updates, subvolume, s);
-       if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
-               ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
-       return ret;
-}
-
-int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
-                      bool inconsistent_if_not_found,
-                      struct bch_subvolume *s)
-{
-       return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
-}
-
-int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
-{
-       struct bch_subvolume s;
-       int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
-       if (ret)
-               return ret;
-
-       if (BCH_SUBVOLUME_RO(&s))
-               return -EROFS;
-       return 0;
-}
-
-int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
-{
-       return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
-}
-
-int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
-                            struct bch_subvolume *subvol)
-{
-       struct bch_snapshot snap;
-
-       return  bch2_snapshot_lookup(trans, snapshot, &snap) ?:
-               bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
-}
-
-int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
-                                 u32 *snapid, bool warn)
-{
-       struct btree_iter iter;
-       struct bkey_s_c_subvolume subvol;
-       int ret;
-
-       subvol = bch2_bkey_get_iter_typed(trans, &iter,
-                                         BTREE_ID_subvolumes, POS(0, subvolid),
-                                         BTREE_ITER_cached|BTREE_ITER_with_updates,
-                                         subvolume);
-       ret = bkey_err(subvol);
-
-       if (bch2_err_matches(ret, ENOENT))
-               ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-
-       if (likely(!ret))
-               *snapid = le32_to_cpu(subvol.v->snapshot);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
-                               u32 *snapid)
-{
-       return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
-}
-
-static int bch2_subvolume_reparent(struct btree_trans *trans,
-                                  struct btree_iter *iter,
-                                  struct bkey_s_c k,
-                                  u32 old_parent, u32 new_parent)
-{
-       struct bkey_i_subvolume *s;
-       int ret;
-
-       if (k.k->type != KEY_TYPE_subvolume)
-               return 0;
-
-       if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
-           le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
-               return 0;
-
-       s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
-       ret = PTR_ERR_OR_ZERO(s);
-       if (ret)
-               return ret;
-
-       s->v.creation_parent = cpu_to_le32(new_parent);
-       return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
-       struct bch_subvolume s;
-
-       return lockrestart_do(trans,
-                       bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
-               for_each_btree_key_commit(trans, iter,
-                               BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
-                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                       bch2_subvolume_reparent(trans, &iter, k,
-                                       subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
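
The comment above gives the policy; mechanically, every subvolume whose
creation_parent is the dying subvolume gets re-pointed at the dying
subvolume's own creation parent. A userspace sketch of that pass over a
flat array (illustrative names, not bcachefs code):

#include <stdio.h>

struct toy_subvol { unsigned id, creation_parent; };

static void reparent(struct toy_subvol *subvols, unsigned nr,
		     unsigned dying, unsigned new_parent)
{
	for (unsigned i = 0; i < nr; i++)
		if (subvols[i].creation_parent == dying)
			subvols[i].creation_parent = new_parent;
}

int main(void)
{
	struct toy_subvol s[] = { {1, 0}, {2, 1}, {3, 2}, {4, 2} };

	/* delete subvol 2: its children 3 and 4 move under subvol 1 */
	reparent(s, 4, 2, s[1].creation_parent);
	for (unsigned i = 0; i < 4; i++)
		printf("subvol %u parent %u\n", s[i].id, s[i].creation_parent);
	return 0;
}
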
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-       struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
-       struct bkey_s_c_subvolume subvol =
-               bch2_bkey_get_iter_typed(trans, &subvol_iter,
-                               BTREE_ID_subvolumes, POS(0, subvolid),
-                               BTREE_ITER_cached|BTREE_ITER_intent,
-                               subvolume);
-       int ret = bkey_err(subvol);
-       if (bch2_err_matches(ret, ENOENT))
-               ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-       if (ret)
-               goto err;
-
-       u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
-       struct bkey_s_c_snapshot snapshot =
-               bch2_bkey_get_iter_typed(trans, &snapshot_iter,
-                               BTREE_ID_snapshots, POS(0, snapid),
-                               0, snapshot);
-       ret = bkey_err(snapshot);
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-                               "missing snapshot %u", snapid);
-       if (ret)
-               goto err;
-
-       u32 treeid = le32_to_cpu(snapshot.v->tree);
-
-       struct bkey_s_c_snapshot_tree snapshot_tree =
-               bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
-                               BTREE_ID_snapshot_trees, POS(0, treeid),
-                               0, snapshot_tree);
-       ret = bkey_err(snapshot_tree);
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-                               "missing snapshot tree %u", treeid);
-       if (ret)
-               goto err;
-
-       if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
-               struct bkey_i_snapshot_tree *snapshot_tree_mut =
-                       bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
-                                                &snapshot_tree.s_c,
-                                                0, snapshot_tree);
-               ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
-               if (ret)
-                       goto err;
-
-               snapshot_tree_mut->v.master_subvol = 0;
-       }
-
-       ret =   bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
-               bch2_snapshot_node_set_deleted(trans, snapid);
-err:
-       bch2_trans_iter_exit(trans, &snapshot_tree_iter);
-       bch2_trans_iter_exit(trans, &snapshot_iter);
-       bch2_trans_iter_exit(trans, &subvol_iter);
-       return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
-       int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
-               commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                         __bch2_subvolume_delete(trans, subvolid));
-
-       bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
-       return ret;
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs,
-                               snapshot_wait_for_pagecache_and_delete_work);
-       int ret = 0;
-
-       while (!ret) {
-               mutex_lock(&c->snapshots_unlinked_lock);
-               snapshot_id_list s = c->snapshots_unlinked;
-               darray_init(&c->snapshots_unlinked);
-               mutex_unlock(&c->snapshots_unlinked_lock);
-
-               if (!s.nr)
-                       break;
-
-               bch2_evict_subvolume_inodes(c, &s);
-
-               darray_for_each(s, id) {
-                       ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
-                       bch_err_msg(c, ret, "deleting subvolume %u", *id);
-                       if (ret)
-                               break;
-               }
-
-               darray_exit(&s);
-       }
-
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
-}
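
The worker above uses the classic "steal the whole list under the lock,
process it unlocked, repeat until empty" shape, so producers adding to
snapshots_unlinked never wait on inode eviction. A toy pthreads sketch of
the same pattern (illustrative names; compile with -pthread):

#include <pthread.h>
#include <stdio.h>

#define MAX_IDS 16

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned pending[MAX_IDS];
static unsigned nr_pending;

static void drain(void)
{
	for (;;) {
		unsigned batch[MAX_IDS], nr;

		/* Steal the whole batch; hold the lock only to swap. */
		pthread_mutex_lock(&lock);
		nr = nr_pending;
		for (unsigned i = 0; i < nr; i++)
			batch[i] = pending[i];
		nr_pending = 0;
		pthread_mutex_unlock(&lock);

		if (!nr)
			break;

		for (unsigned i = 0; i < nr; i++)
			printf("processing %u\n", batch[i]);	/* stand-in for the delete */
	}
}

int main(void)
{
	pending[nr_pending++] = 7;
	pending[nr_pending++] = 9;
	drain();
	return 0;
}
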
-
-struct subvolume_unlink_hook {
-       struct btree_trans_commit_hook  h;
-       u32                             subvol;
-};
-
-static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
-                                                     struct btree_trans_commit_hook *_h)
-{
-       struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
-       mutex_lock(&c->snapshots_unlinked_lock);
-       if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
-               ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
-       mutex_unlock(&c->snapshots_unlinked_lock);
-
-       if (ret)
-               return ret;
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
-               return -EROFS;
-
-       if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
-               enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
-       return 0;
-}
-
-int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
-{
-       struct btree_iter iter;
-       struct bkey_i_subvolume *n;
-       struct subvolume_unlink_hook *h;
-       int ret = 0;
-
-       h = bch2_trans_kmalloc(trans, sizeof(*h));
-       ret = PTR_ERR_OR_ZERO(h);
-       if (ret)
-               return ret;
-
-       h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
-       h->subvol       = subvolid;
-       bch2_trans_commit_hook(trans, &h->h);
-
-       n = bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_subvolumes, POS(0, subvolid),
-                       BTREE_ITER_cached, subvolume);
-       ret = PTR_ERR_OR_ZERO(n);
-       if (bch2_err_matches(ret, ENOENT))
-               ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-       if (unlikely(ret))
-               return ret;
-
-       SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
-       n->v.fs_path_parent = 0;
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
-                         u32 parent_subvolid,
-                         u32 src_subvolid,
-                         u32 *new_subvolid,
-                         u32 *new_snapshotid,
-                         bool ro)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter dst_iter, src_iter = {};
-       struct bkey_i_subvolume *new_subvol = NULL;
-       struct bkey_i_subvolume *src_subvol = NULL;
-       u32 parent = 0, new_nodes[2], snapshot_subvols[2];
-       int ret = 0;
-
-       ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
-                               BTREE_ID_subvolumes, POS(0, U32_MAX));
-       if (ret == -BCH_ERR_ENOSPC_btree_slot)
-               ret = bch_err_throw(c, ENOSPC_subvolume_create);
-       if (ret)
-               return ret;
-
-       snapshot_subvols[0] = dst_iter.pos.offset;
-       snapshot_subvols[1] = src_subvolid;
-
-       if (src_subvolid) {
-               /* Creating a snapshot: */
-
-               src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
-                               BTREE_ID_subvolumes, POS(0, src_subvolid),
-                               BTREE_ITER_cached, subvolume);
-               ret = PTR_ERR_OR_ZERO(src_subvol);
-               if (bch2_err_matches(ret, ENOENT))
-                       ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
-               if (unlikely(ret))
-                       goto err;
-
-               parent = le32_to_cpu(src_subvol->v.snapshot);
-       }
-
-       ret = bch2_snapshot_node_create(trans, parent, new_nodes,
-                                       snapshot_subvols,
-                                       src_subvolid ? 2 : 1);
-       if (ret)
-               goto err;
-
-       if (src_subvolid) {
-               src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
-               ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
-               if (ret)
-                       goto err;
-       }
-
-       new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
-       ret = PTR_ERR_OR_ZERO(new_subvol);
-       if (ret)
-               goto err;
-
-       new_subvol->v.flags             = 0;
-       new_subvol->v.snapshot          = cpu_to_le32(new_nodes[0]);
-       new_subvol->v.inode             = cpu_to_le64(inode);
-       new_subvol->v.creation_parent   = cpu_to_le32(src_subvolid);
-       new_subvol->v.fs_path_parent    = cpu_to_le32(parent_subvolid);
-       new_subvol->v.otime.lo          = cpu_to_le64(bch2_current_time(c));
-       new_subvol->v.otime.hi          = 0;
-
-       SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
-       SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
-
-       *new_subvolid   = new_subvol->k.p.offset;
-       *new_snapshotid = new_nodes[0];
-err:
-       bch2_trans_iter_exit(trans, &src_iter);
-       bch2_trans_iter_exit(trans, &dst_iter);
-       return ret;
-}
-
-int bch2_initialize_subvolumes(struct bch_fs *c)
-{
-       struct bkey_i_snapshot_tree     root_tree;
-       struct bkey_i_snapshot          root_snapshot;
-       struct bkey_i_subvolume         root_volume;
-       int ret;
-
-       bkey_snapshot_tree_init(&root_tree.k_i);
-       root_tree.k.p.offset            = 1;
-       root_tree.v.master_subvol       = cpu_to_le32(1);
-       root_tree.v.root_snapshot       = cpu_to_le32(U32_MAX);
-
-       bkey_snapshot_init(&root_snapshot.k_i);
-       root_snapshot.k.p.offset = U32_MAX;
-       root_snapshot.v.flags   = 0;
-       root_snapshot.v.parent  = 0;
-       root_snapshot.v.subvol  = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
-       root_snapshot.v.tree    = cpu_to_le32(1);
-       SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
-       bkey_subvolume_init(&root_volume.k_i);
-       root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
-       root_volume.v.flags     = 0;
-       root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
-       root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
-
-       ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,   &root_tree.k_i, NULL, 0, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_snapshots,        &root_snapshot.k_i, NULL, 0, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_subvolumes,       &root_volume.k_i, NULL, 0, 0);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bch_inode_unpacked inode;
-       int ret;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-                              SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (!bkey_is_inode(k.k)) {
-               struct bch_fs *c = trans->c;
-               bch_err(c, "root inode not found");
-               ret = bch_err_throw(c, ENOENT_inode);
-               goto err;
-       }
-
-       ret = bch2_inode_unpack(k, &inode);
-       BUG_ON(ret);
-
-       inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
-       ret = bch2_inode_write(trans, &iter, &inode);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/* set bi_subvol on root inode */
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
-       int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-                                      __bch2_fs_upgrade_for_subvolumes(trans));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *c)
-{
-       INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
-                 bch2_subvolume_wait_for_pagecache_and_delete);
-}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
deleted file mode 100644 (file)
index 075f55e..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_H
-#define _BCACHEFS_SUBVOLUME_H
-
-#include "darray.h"
-#include "subvolume_types.h"
-
-int bch2_check_subvols(struct bch_fs *);
-int bch2_check_subvol_children(struct bch_fs *);
-
-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
-                           struct bkey_validate_context);
-void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
-                          struct bkey_s_c, struct bkey_s,
-                          enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_subvolume ((struct bkey_ops) {           \
-       .key_validate   = bch2_subvolume_validate,              \
-       .val_to_text    = bch2_subvolume_to_text,               \
-       .trigger        = bch2_subvolume_trigger,               \
-       .min_val_size   = 16,                                   \
-})
-
-int bch2_subvol_has_children(struct btree_trans *, u32);
-int bch2_subvolume_get(struct btree_trans *, unsigned,
-                      bool, struct bch_subvolume *);
-int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
-                                 u32 *, bool);
-int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
-
-int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
-int bch2_subvol_is_ro(struct bch_fs *, u32);
-
-static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
-                                          struct bpos end, u32 subvolid, unsigned flags)
-{
-       u32 snapshot;
-       int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
-       if (ret)
-               return bkey_s_c_err(ret);
-
-       bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-       return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
-}
-
-#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter,            \
-                                        _end, _subvolid, _flags, _k, _do)      \
-({                                                                             \
-       struct bkey_s_c _k;                                                     \
-       int _ret3 = 0;                                                          \
-                                                                               \
-       do {                                                                    \
-               _ret3 = lockrestart_do(_trans, ({                               \
-                       (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
-                                               _end, _subvolid, (_flags));     \
-                       if (!(_k).k)                                            \
-                               break;                                          \
-                                                                               \
-                       bkey_err(_k) ?: (_do);                                  \
-               }));                                                            \
-       } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter)));          \
-                                                                               \
-       bch2_trans_iter_exit((_trans), &(_iter));                               \
-       _ret3;                                                                  \
-})
-
-#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id,          \
-                               _start, _end, _subvolid, _flags, _k, _do)       \
-({                                                                             \
-       struct btree_iter _iter;                                                \
-       bch2_trans_iter_init((_trans), &(_iter), (_btree_id),                   \
-                            (_start), (_flags));                               \
-                                                                               \
-       for_each_btree_key_in_subvolume_max_continue(_trans, _iter,             \
-                                       _end, _subvolid, _flags, _k, _do);      \
-})
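
Both macros return a value from a multi-statement body via the GCC/clang
statement-expression extension: a ({ ... }) block is an expression whose
value is its final statement, which is what lets _ret3 propagate out of
the loop. A minimal illustration of the extension (assumes GCC or clang):

#include <stdio.h>

/* A ({ ... }) block is an expression; its value is the last statement's. */
#define FIRST_NEGATIVE(arr, n) ({		\
	int _ret = 0;				\
	for (int _i = 0; _i < (n); _i++)	\
		if ((arr)[_i] < 0) {		\
			_ret = (arr)[_i];	\
			break;			\
		}				\
	_ret;					\
})

int main(void)
{
	int a[] = { 3, 7, -2, 5 };

	printf("%d\n", FIRST_NEGATIVE(a, 4));	/* prints -2 */
	return 0;
}
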
-
-int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
-
-int bch2_initialize_subvolumes(struct bch_fs *);
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
deleted file mode 100644 (file)
index e029df7..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
-#define _BCACHEFS_SUBVOLUME_FORMAT_H
-
-#define SUBVOL_POS_MIN         POS(0, 1)
-#define SUBVOL_POS_MAX         POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL   1
-
-struct bch_subvolume {
-       struct bch_val          v;
-       __le32                  flags;
-       __le32                  snapshot;
-       __le64                  inode;
-       /*
-        * Snapshot subvolumes form a tree, separate from the snapshot nodes
-        * tree - if this subvolume is a snapshot, this is the ID of the
-        * subvolume it was created from:
-        *
-        * This is _not_ necessarily the subvolume of the directory containing
-        * this subvolume:
-        */
-       __le32                  creation_parent;
-       __le32                  fs_path_parent;
-       bch_le128               otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO,         struct bch_subvolume, flags,  0,  1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,   struct bch_subvolume, flags,  2,  3)
-
-#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
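
LE32_BITMASK(NAME, type, field, lo, hi) generates a NAME() getter and a
SET_NAME() setter for bits [lo, hi) of a little-endian flags word; that is
what BCH_SUBVOLUME_RO() and SET_BCH_SUBVOLUME_SNAP() in subvolume.c expand
from. A simplified userspace sketch of the expansion, eliding the le32
conversions and assuming a little-endian host (illustrative only):

#include <stdint.h>
#include <stdio.h>

struct toy_subvolume { uint32_t flags; };

#define TOY_BITMASK(name, lo, hi)					\
static inline uint32_t name(const struct toy_subvolume *s)		\
{									\
	return (s->flags >> (lo)) & ~(~0u << ((hi) - (lo)));		\
}									\
static inline void SET_##name(struct toy_subvolume *s, uint32_t v)	\
{									\
	s->flags &= ~(~(~0u << ((hi) - (lo))) << (lo));			\
	s->flags |= (v & ~(~0u << ((hi) - (lo)))) << (lo);		\
}

TOY_BITMASK(TOY_SUBVOLUME_RO,		0, 1)
TOY_BITMASK(TOY_SUBVOLUME_SNAP,		1, 2)
TOY_BITMASK(TOY_SUBVOLUME_UNLINKED,	2, 3)

int main(void)
{
	struct toy_subvolume s = { 0 };

	SET_TOY_SUBVOLUME_SNAP(&s, 1);
	printf("ro=%u snap=%u flags=0x%x\n",
	       TOY_SUBVOLUME_RO(&s), TOY_SUBVOLUME_SNAP(&s), s.flags);
	return 0;
}
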
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
deleted file mode 100644 (file)
index 9d634b9..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
-#define _BCACHEFS_SUBVOLUME_TYPES_H
-
-typedef struct {
-       /* we can't have padding in this struct: */
-       u64             subvol;
-       u64             inum;
-} subvol_inum;
-
-#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
deleted file mode 100644 (file)
index 6c2e1d6..0000000
+++ /dev/null
@@ -1,1562 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "quota.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "super.h"
-#include "trace.h"
-#include "vstructs.h"
-
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-struct bch2_metadata_version {
-       u16             version;
-       const char      *name;
-};
-
-static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v) {              \
-       .version = v,                           \
-       .name = #n,                             \
-},
-       BCH_METADATA_VERSIONS()
-#undef x
-};
-
-void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
-{
-       const char *str = "(unknown version)";
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
-               if (bch2_metadata_versions[i].version == v) {
-                       str = bch2_metadata_versions[i].name;
-                       break;
-               }
-
-       prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
-}
-
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
-{
-       if (!BCH_VERSION_MAJOR(v))
-               return v;
-
-       for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
-               if (bch2_metadata_versions[i].version > v &&
-                   BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
-                   BCH_VERSION_MAJOR(v))
-                       v = bch2_metadata_versions[i].version;
-
-       return v;
-}
-
-int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
-{
-       int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
-                  version <= c->sb.version_incompat_allowed)
-               ? 0
-               : -BCH_ERR_may_not_use_incompat_feature;
-
-       mutex_lock(&c->sb_lock);
-       if (!ret) {
-               SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
-                       max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
-               bch2_write_super(c);
-       } else {
-               darray_for_each(c->incompat_versions_requested, i)
-                       if (version == *i)
-                               goto out;
-
-               darray_push(&c->incompat_versions_requested, version);
-               struct printbuf buf = PRINTBUF;
-               prt_str(&buf, "requested incompat feature ");
-               bch2_version_to_text(&buf, version);
-               prt_str(&buf, " currently not enabled, allowed up to ");
-               bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
-               prt_printf(&buf, "\n  set version_upgrade=incompat to enable");
-
-               bch_notice(c, "%s", buf.buf);
-               printbuf_exit(&buf);
-       }
-
-out:
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-const char * const bch2_sb_fields[] = {
-#define x(name, nr)    #name,
-       BCH_SB_FIELDS()
-#undef x
-       NULL
-};
-
-static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
-                                 enum bch_validate_flags, struct printbuf *);
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
-                                     enum bch_sb_field_type type)
-{
-       /* XXX: need locking around superblock to access optional fields */
-
-       vstruct_for_each(sb, f)
-               if (le32_to_cpu(f->type) == type)
-                       return f;
-       return NULL;
-}
-
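-/*
- * Resize optional field @f to @u64s u64s, or append a new field if @f is
- * NULL: fields after @f are shifted with memmove(), and when growing, the
- * newly added space is zeroed. @u64s == 0 deletes the field:
- */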
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
-                                                  struct bch_sb_field *f,
-                                                  unsigned u64s)
-{
-       unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-       unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
-
-       BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
-
-       if (!f && !u64s) {
-               /* nothing to do: */
-       } else if (!f) {
-               f = vstruct_last(sb->sb);
-               memset(f, 0, sizeof(u64) * u64s);
-               f->u64s = cpu_to_le32(u64s);
-               f->type = 0;
-       } else {
-               void *src, *dst;
-
-               src = vstruct_end(f);
-
-               if (u64s) {
-                       f->u64s = cpu_to_le32(u64s);
-                       dst = vstruct_end(f);
-               } else {
-                       dst = f;
-               }
-
-               memmove(dst, src, vstruct_end(sb->sb) - src);
-
-               if (dst > src)
-                       memset(src, 0, dst - src);
-       }
-
-       sb->sb->u64s = cpu_to_le32(sb_u64s);
-
-       return u64s ? f : NULL;
-}
-
-void bch2_sb_field_delete(struct bch_sb_handle *sb,
-                         enum bch_sb_field_type type)
-{
-       struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
-       if (f)
-               __bch2_sb_field_resize(sb, f, 0);
-}
-
-/* Superblock realloc/free: */
-
-void bch2_free_super(struct bch_sb_handle *sb)
-{
-       kfree(sb->bio);
-       if (!IS_ERR_OR_NULL(sb->s_bdev_file))
-               bdev_fput(sb->s_bdev_file);
-       kfree(sb->holder);
-       kfree(sb->sb_name);
-
-       kfree(sb->sb);
-       memset(sb, 0, sizeof(*sb));
-}
-
-int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
-       size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
-       size_t new_buffer_size;
-       struct bch_sb *new_sb;
-       struct bio *bio;
-
-       if (sb->bdev)
-               new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
-
-       new_buffer_size = roundup_pow_of_two(new_bytes);
-
-       if (sb->sb && sb->buffer_size >= new_buffer_size)
-               return 0;
-
-       if (sb->sb && sb->have_layout) {
-               u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
-               if (new_bytes > max_bytes) {
-                       struct printbuf buf = PRINTBUF;
-
-                       prt_bdevname(&buf, sb->bdev);
-                       prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
-                       pr_err("%s", buf.buf);
-                       printbuf_exit(&buf);
-                       return -BCH_ERR_ENOSPC_sb;
-               }
-       }
-
-       if (dynamic_fault("bcachefs:add:super_realloc"))
-               return -BCH_ERR_ENOMEM_sb_realloc_injected;
-
-       new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
-       if (!new_sb)
-               return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
-       sb->sb = new_sb;
-
-       if (sb->have_bio) {
-               unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
-
-               bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-               if (!bio)
-                       return -BCH_ERR_ENOMEM_sb_bio_realloc;
-
-               bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
-
-               kfree(sb->bio);
-               sb->bio = bio;
-       }
-
-       sb->buffer_size = new_buffer_size;
-
-       return 0;
-}
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
-                                         enum bch_sb_field_type type,
-                                         unsigned u64s)
-{
-       struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-       ssize_t d = -old_u64s + u64s;
-
-       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
-               return NULL;
-
-       if (sb->fs_sb) {
-               struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
-
-               lockdep_assert_held(&c->sb_lock);
-
-               /* XXX: we're not checking that offline devices have enough space */
-
-               for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
-                       struct bch_sb_handle *dev_sb = &ca->disk_sb;
-
-                       if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
-                               enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
-                               return NULL;
-                       }
-               }
-       }
-
-       f = bch2_sb_field_get_id(sb->sb, type);
-       f = __bch2_sb_field_resize(sb, f, u64s);
-       if (f)
-               f->type = cpu_to_le32(type);
-       return f;
-}
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
-                                                 enum bch_sb_field_type type,
-                                                 unsigned u64s)
-{
-       struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
-       if (!f || le32_to_cpu(f->u64s) < u64s)
-               f = bch2_sb_field_resize_id(sb, type, u64s);
-       return f;
-}
-
-/* Superblock validate: */
-
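-/*
- * Validate the superblock layout sector: check the magic, layout type, number
- * of superblock copies, maximum superblock size, and that successive copies
- * don't overlap given that maximum size:
- */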
-static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
-{
-       u64 offset, prev_offset, max_sectors;
-       unsigned i;
-
-       BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-
-       if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
-           !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
-               prt_printf(out, "Not a bcachefs superblock layout");
-               return -BCH_ERR_invalid_sb_layout;
-       }
-
-       if (layout->layout_type != 0) {
-               prt_printf(out, "Invalid superblock layout type %u",
-                      layout->layout_type);
-               return -BCH_ERR_invalid_sb_layout_type;
-       }
-
-       if (!layout->nr_superblocks) {
-               prt_printf(out, "Invalid superblock layout: no superblocks");
-               return -BCH_ERR_invalid_sb_layout_nr_superblocks;
-       }
-
-       if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
-               prt_printf(out, "Invalid superblock layout: too many superblocks");
-               return -BCH_ERR_invalid_sb_layout_nr_superblocks;
-       }
-
-       if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
-               prt_printf(out, "Invalid superblock layout: max_size_bits too high");
-               return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
-       }
-
-       max_sectors = 1 << layout->sb_max_size_bits;
-
-       prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
-       for (i = 1; i < layout->nr_superblocks; i++) {
-               offset = le64_to_cpu(layout->sb_offset[i]);
-
-               if (offset < prev_offset + max_sectors) {
-                       prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
-                              "  (sb %u ends at %llu, next starts at %llu)",
-                              i - 1, prev_offset + max_sectors, offset);
-                       return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
-               }
-               prev_offset = offset;
-       }
-
-       return 0;
-}
-
-static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
-{
-       u16 version             = le16_to_cpu(sb->version);
-       u16 version_min         = le16_to_cpu(sb->version_min);
-
-       if (!bch2_version_compatible(version)) {
-               prt_str(out, "Unsupported superblock version ");
-               bch2_version_to_text(out, version);
-               prt_str(out, " (min ");
-               bch2_version_to_text(out, bcachefs_metadata_version_min);
-               prt_str(out, ", max ");
-               bch2_version_to_text(out, bcachefs_metadata_version_current);
-               prt_str(out, ")");
-               return -BCH_ERR_invalid_sb_version;
-       }
-
-       if (!bch2_version_compatible(version_min)) {
-               prt_str(out, "Unsupported superblock version_min ");
-               bch2_version_to_text(out, version_min);
-               prt_str(out, " (min ");
-               bch2_version_to_text(out, bcachefs_metadata_version_min);
-               prt_str(out, ", max ");
-               bch2_version_to_text(out, bcachefs_metadata_version_current);
-               prt_str(out, ")");
-               return -BCH_ERR_invalid_sb_version;
-       }
-
-       if (version_min > version) {
-               prt_str(out, "Bad minimum version ");
-               bch2_version_to_text(out, version_min);
-               prt_str(out, ", greater than version field ");
-               bch2_version_to_text(out, version);
-               return -BCH_ERR_invalid_sb_version;
-       }
-
-       return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
-                    enum bch_validate_flags flags, struct printbuf *out)
-{
-       enum bch_opt_id opt_id;
-       int ret;
-
-       ret = bch2_sb_compatible(sb, out);
-       if (ret)
-               return ret;
-
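-       /*
-        * Check for feature bits we don't know about: anything above
-        * BCH_FEATURE_NR in features[0], or any bit at all in features[1]:
-        */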
-       u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
-       unsigned incompat_bit = 0;
-       if (incompat)
-               incompat_bit = __ffs64(incompat);
-       else if (sb->features[1])
-               incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
-       if (incompat_bit) {
-               prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
-                          incompat_bit,
-                          bch2_sb_features[BCH_FEATURE_NR - 1],
-                          BCH_FEATURE_NR - 1);
-               return -BCH_ERR_invalid_sb_features;
-       }
-
-       if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
-           BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
-               prt_str(out, "Filesystem has incompatible version ");
-               bch2_version_to_text(out, le16_to_cpu(sb->version));
-               prt_str(out, ", current version ");
-               bch2_version_to_text(out, bcachefs_metadata_version_current);
-               return -BCH_ERR_invalid_sb_features;
-       }
-
-       if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
-               prt_printf(out, "Bad user UUID (got zeroes)");
-               return -BCH_ERR_invalid_sb_uuid;
-       }
-
-       if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-               prt_printf(out, "Bad internal UUID (got zeroes)");
-               return -BCH_ERR_invalid_sb_uuid;
-       }
-
-       if (!(flags & BCH_VALIDATE_write) &&
-           le64_to_cpu(sb->offset) != read_offset) {
-               prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
-                          le64_to_cpu(sb->offset), read_offset);
-               return -BCH_ERR_invalid_sb_offset;
-       }
-
-       if (!sb->nr_devices ||
-           sb->nr_devices > BCH_SB_MEMBERS_MAX) {
-               prt_printf(out, "Bad number of member devices %u (max %u)",
-                      sb->nr_devices, BCH_SB_MEMBERS_MAX);
-               return -BCH_ERR_invalid_sb_too_many_members;
-       }
-
-       if (sb->dev_idx >= sb->nr_devices) {
-               prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
-                      sb->dev_idx, sb->nr_devices);
-               return -BCH_ERR_invalid_sb_dev_idx;
-       }
-
-       if (!sb->time_precision ||
-           le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
-               prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
-                      le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
-               return -BCH_ERR_invalid_sb_time_precision;
-       }
-
-       /* old versions didn't know to downgrade this field */
-       if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
-               SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
-       if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
-               prt_printf(out, "Invalid version_incompat ");
-               bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
-               prt_str(out, " > incompat_allowed ");
-               bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
-               if (flags & BCH_VALIDATE_write)
-                       return -BCH_ERR_invalid_sb_version;
-               else
-                       SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
-       }
-
-       if (sb->nr_devices > 1)
-               SET_BCH_SB_MULTI_DEVICE(sb, true);
-
-       if (!flags) {
-               /*
-                * Been seeing a bug where these are getting inexplicably
-                * zeroed, so we're now validating them, but we have to be
-                * careful not to prevent people's filesystems from mounting:
-                */
-               if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
-                       SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
-               if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
-                       SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
-
-               if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
-                       SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
-
-               if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
-                   !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
-                       SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
-
-               if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
-                       SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
-
-               if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
-                       SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
-
-               if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
-                   !BCH_SB_CSUM_ERR_RETRY_NR(sb))
-                       SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
-       }
-
-#ifdef __KERNEL__
-       if (!BCH_SB_SHARD_INUMS_NBITS(sb))
-               SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
-#endif
-
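-       /*
-        * The "Invalid option " prefix is printed before validating, so that
-        * the error message reads correctly if validation fails; on success
-        * the printbuf is reset:
-        */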
-       for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
-               const struct bch_option *opt = bch2_opt_table + opt_id;
-
-               if (opt->get_sb) {
-                       u64 v = bch2_opt_from_sb(sb, opt_id, -1);
-
-                       prt_printf(out, "Invalid option ");
-                       ret = bch2_opt_validate(opt, v, out);
-                       if (ret)
-                               return ret;
-
-                       printbuf_reset(out);
-               }
-       }
-
-       /* validate layout */
-       ret = validate_sb_layout(&sb->layout, out);
-       if (ret)
-               return ret;
-
-       vstruct_for_each(sb, f) {
-               if (!f->u64s) {
-                       prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
-                              le32_to_cpu(f->type));
-                       return -BCH_ERR_invalid_sb_field_size;
-               }
-
-               if (vstruct_next(f) > vstruct_last(sb)) {
-                       prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
-                              le32_to_cpu(f->type));
-                       return -BCH_ERR_invalid_sb_field_size;
-               }
-       }
-
-       struct bch_sb_field *mi =
-               bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
-               bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
-
-       /* members must be validated first: */
-       if (!mi) {
-               prt_printf(out, "Invalid superblock: member info area missing");
-               return -BCH_ERR_invalid_sb_members_missing;
-       }
-
-       ret = bch2_sb_field_validate(sb, mi, flags, out);
-       if (ret)
-               return ret;
-
-       vstruct_for_each(sb, f) {
-               if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
-                       continue;
-
-               ret = bch2_sb_field_validate(sb, f, flags, out);
-               if (ret)
-                       return ret;
-       }
-
-       if ((flags & BCH_VALIDATE_write) &&
-           bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
-               prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
-                          le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
-                          le64_to_cpu(sb->seq));
-               return -BCH_ERR_invalid_sb_members_missing;
-       }
-
-       return 0;
-}
-
-/* device open: */
-
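-/*
- * On-disk bitvectors are arrays of little-endian unsigned longs; convert them
- * to host byte order one word at a time, for both 32 and 64 bit words:
- */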
-static unsigned long le_ulong_to_cpu(unsigned long v)
-{
-       return sizeof(unsigned long) == 8
-               ? le64_to_cpu(v)
-               : le32_to_cpu(v);
-}
-
-static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
-{
-       BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
-
-       for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
-               dst[i] = le_ulong_to_cpu(src[i]);
-}
-
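-/*
- * Cache commonly used superblock fields in c->sb, in native byte order, so
- * that hot paths don't need to repeat the endian conversions:
- */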
-static void bch2_sb_update(struct bch_fs *c)
-{
-       struct bch_sb *src = c->disk_sb.sb;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       c->sb.uuid              = src->uuid;
-       c->sb.user_uuid         = src->user_uuid;
-       c->sb.version           = le16_to_cpu(src->version);
-       c->sb.version_incompat  = BCH_SB_VERSION_INCOMPAT(src);
-       c->sb.version_incompat_allowed
-                               = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
-       c->sb.version_min       = le16_to_cpu(src->version_min);
-       c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
-       c->sb.nr_devices        = src->nr_devices;
-       c->sb.clean             = BCH_SB_CLEAN(src);
-       c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
-
-       c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
-       c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
-
-       /* XXX this is wrong, we need a 96 or 128 bit integer type */
-       c->sb.time_base_lo      = div_u64(le64_to_cpu(src->time_base_lo),
-                                         c->sb.nsec_per_time_unit);
-       c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
-
-       c->sb.features          = le64_to_cpu(src->features[0]);
-       c->sb.compat            = le64_to_cpu(src->compat[0]);
-       c->sb.multi_device      = BCH_SB_MULTI_DEVICE(src);
-
-       memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
-
-       struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
-       if (ext) {
-               c->sb.recovery_passes_required =
-                       bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
-               le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
-                                   sizeof(c->sb.errors_silent) * 8);
-               c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
-       }
-
-       for_each_member_device(c, ca) {
-               struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
-               ca->mi = bch2_mi_to_cpu(&m);
-       }
-}
-
-static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
-{
-       struct bch_sb_field *src_f, *dst_f;
-       struct bch_sb *dst = dst_handle->sb;
-       unsigned i;
-
-       dst->version            = src->version;
-       dst->version_min        = src->version_min;
-       dst->seq                = src->seq;
-       dst->uuid               = src->uuid;
-       dst->user_uuid          = src->user_uuid;
-       memcpy(dst->label,      src->label, sizeof(dst->label));
-
-       dst->block_size         = src->block_size;
-       dst->nr_devices         = src->nr_devices;
-
-       dst->time_base_lo       = src->time_base_lo;
-       dst->time_base_hi       = src->time_base_hi;
-       dst->time_precision     = src->time_precision;
-       dst->write_time         = src->write_time;
-
-       memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
-       memcpy(dst->features,   src->features,  sizeof(dst->features));
-       memcpy(dst->compat,     src->compat,    sizeof(dst->compat));
-
-       for (i = 0; i < BCH_SB_FIELD_NR; i++) {
-               int d;
-
-               if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
-                       continue;
-
-               src_f = bch2_sb_field_get_id(src, i);
-               dst_f = bch2_sb_field_get_id(dst, i);
-
-               d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
-                   (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
-               if (d > 0) {
-                       int ret = bch2_sb_realloc(dst_handle,
-                                       le32_to_cpu(dst_handle->sb->u64s) + d);
-
-                       if (ret)
-                               return ret;
-
-                       dst = dst_handle->sb;
-                       dst_f = bch2_sb_field_get_id(dst, i);
-               }
-
-               dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
-                               src_f ? le32_to_cpu(src_f->u64s) : 0);
-
-               if (src_f)
-                       memcpy(dst_f, src_f, vstruct_bytes(src_f));
-       }
-
-       return 0;
-}
-
-int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
-{
-       int ret;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       ret =   bch2_sb_realloc(&c->disk_sb, 0) ?:
-               __copy_super(&c->disk_sb, src) ?:
-               bch2_sb_replicas_to_cpu_replicas(c) ?:
-               bch2_sb_disk_groups_to_cpu(c);
-       if (ret)
-               return ret;
-
-       bch2_sb_update(c);
-       return 0;
-}
-
-int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
-{
-       return __copy_super(&ca->disk_sb, c->disk_sb.sb);
-}
-
-/* read superblock: */
-
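-/*
- * Read a single superblock copy at @offset, checking magic, version
- * compatibility and checksum; if the superblock is larger than our current
- * buffer, grow the buffer and reread:
- */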
-static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
-{
-       size_t bytes;
-       int ret;
-reread:
-       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
-       sb->bio->bi_iter.bi_sector = offset;
-       bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
-
-       ret = submit_bio_wait(sb->bio);
-       if (ret) {
-               prt_printf(err, "IO error: %i", ret);
-               return ret;
-       }
-
-       if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
-           !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
-               prt_str(err, "Not a bcachefs superblock (got magic ");
-               pr_uuid(err, sb->sb->magic.b);
-               prt_str(err, ")");
-               return -BCH_ERR_invalid_sb_magic;
-       }
-
-       ret = bch2_sb_compatible(sb->sb, err);
-       if (ret)
-               return ret;
-
-       bytes = vstruct_bytes(sb->sb);
-
-       u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
-       if (bytes > sb_size) {
-               prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
-                          bytes, sb_size);
-               return -BCH_ERR_invalid_sb_too_big;
-       }
-
-       if (bytes > sb->buffer_size) {
-               ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
-               if (ret)
-                       return ret;
-               goto reread;
-       }
-
-       enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
-       if (csum_type >= BCH_CSUM_NR ||
-           bch2_csum_type_is_encryption(csum_type)) {
-               prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
-               return -BCH_ERR_invalid_sb_csum_type;
-       }
-
-       /* XXX: verify MACs */
-       struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
-       if (bch2_crc_cmp(csum, sb->sb->csum)) {
-               bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
-               return -BCH_ERR_invalid_sb_csum;
-       }
-
-       sb->seq = le64_to_cpu(sb->sb->seq);
-
-       return 0;
-}
-
-static int __bch2_read_super(const char *path, struct bch_opts *opts,
-                   struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
-{
-       u64 offset = opt_get(*opts, sb);
-       struct bch_sb_layout layout;
-       struct printbuf err = PRINTBUF;
-       struct printbuf err2 = PRINTBUF;
-       __le64 *i;
-       int ret;
-#ifndef __KERNEL__
-retry:
-#endif
-       memset(sb, 0, sizeof(*sb));
-       sb->mode        = BLK_OPEN_READ;
-       sb->have_bio    = true;
-       sb->holder      = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
-       if (!sb->holder)
-               return -ENOMEM;
-
-       sb->sb_name = kstrdup(path, GFP_KERNEL);
-       if (!sb->sb_name) {
-               ret = -ENOMEM;
-               prt_printf(&err, "error allocating memory for sb_name");
-               goto err;
-       }
-
-#ifndef __KERNEL__
-       if (opt_get(*opts, direct_io) == false)
-               sb->mode |= BLK_OPEN_BUFFERED;
-#endif
-
-       if (!opt_get(*opts, noexcl))
-               sb->mode |= BLK_OPEN_EXCL;
-
-       if (!opt_get(*opts, nochanges))
-               sb->mode |= BLK_OPEN_WRITE;
-
-       sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-       if (IS_ERR(sb->s_bdev_file) &&
-           PTR_ERR(sb->s_bdev_file) == -EACCES &&
-           opt_get(*opts, read_only)) {
-               sb->mode &= ~BLK_OPEN_WRITE;
-
-               sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-               if (!IS_ERR(sb->s_bdev_file))
-                       opt_set(*opts, nochanges, true);
-       }
-
-       if (IS_ERR(sb->s_bdev_file)) {
-               ret = PTR_ERR(sb->s_bdev_file);
-               prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
-               goto err;
-       }
-       sb->bdev = file_bdev(sb->s_bdev_file);
-
-       ret = bch2_sb_realloc(sb, 0);
-       if (ret) {
-               prt_printf(&err, "error allocating memory for superblock");
-               goto err;
-       }
-
-       if (bch2_fs_init_fault("read_super")) {
-               prt_printf(&err, "dynamic fault");
-               ret = -EFAULT;
-               goto err;
-       }
-
-       ret = read_one_super(sb, offset, &err);
-       if (!ret)
-               goto got_super;
-
-       if (opt_defined(*opts, sb))
-               goto err;
-
-       prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
-              path, err.buf);
-       if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
-               bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
-       else
-               bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
-
-       printbuf_exit(&err2);
-       printbuf_reset(&err);
-
-       /*
-        * Error reading primary superblock - read location of backup
-        * superblocks:
-        */
-       bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
-       sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
-       /*
-        * use sb buffer to read layout, since sb buffer is page aligned but
-        * layout won't be:
-        */
-       bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
-
-       ret = submit_bio_wait(sb->bio);
-       if (ret) {
-               prt_printf(&err, "IO error: %i", ret);
-               goto err;
-       }
-
-       memcpy(&layout, sb->sb, sizeof(layout));
-       ret = validate_sb_layout(&layout, &err);
-       if (ret)
-               goto err;
-
-       for (i = layout.sb_offset;
-            i < layout.sb_offset + layout.nr_superblocks; i++) {
-               offset = le64_to_cpu(*i);
-
-               if (offset == opt_get(*opts, sb)) {
-                       ret = -BCH_ERR_invalid;
-                       continue;
-               }
-
-               ret = read_one_super(sb, offset, &err);
-               if (!ret)
-                       goto got_super;
-       }
-
-       goto err;
-
-got_super:
-       if (le16_to_cpu(sb->sb->block_size) << 9 <
-           bdev_logical_block_size(sb->bdev) &&
-           opt_get(*opts, direct_io)) {
-#ifndef __KERNEL__
-               opt_set(*opts, direct_io, false);
-               bch2_free_super(sb);
-               goto retry;
-#endif
-               prt_printf(&err, "block size (%u) smaller than device block size (%u)",
-                      le16_to_cpu(sb->sb->block_size) << 9,
-                      bdev_logical_block_size(sb->bdev));
-               ret = -BCH_ERR_block_size_too_small;
-               goto err;
-       }
-
-       sb->have_layout = true;
-
-       ret = bch2_sb_validate(sb->sb, offset, 0, &err);
-       if (ret) {
-               bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
-                               path, err.buf);
-               goto err_no_print;
-       }
-out:
-       printbuf_exit(&err);
-       return ret;
-err:
-       bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
-                       path, err.buf);
-err_no_print:
-       bch2_free_super(sb);
-       goto out;
-}
-
-int bch2_read_super(const char *path, struct bch_opts *opts,
-                   struct bch_sb_handle *sb)
-{
-       return __bch2_read_super(path, opts, sb, false);
-}
-
-/* provide a silenced version for mount.bcachefs */
-
-int bch2_read_super_silent(const char *path, struct bch_opts *opts,
-                   struct bch_sb_handle *sb)
-{
-       return __bch2_read_super(path, opts, sb, true);
-}
-
-/* write superblock: */
-
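-/*
- * Superblock write protocol: before writing, the first superblock on each
- * device is read back and its sequence number compared against what we last
- * wrote, to detect writes that were silently dropped or a superblock modified
- * by another process; then the copies are written out to all devices in
- * lockstep, one copy index at a time:
- */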
-static void write_super_endio(struct bio *bio)
-{
-       struct bch_dev *ca = bio->bi_private;
-
-       bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
-
-       /* XXX: return errors directly */
-
-       if (bio->bi_status) {
-               bch_err_dev_ratelimited(ca, "superblock %s error: %s",
-                              str_write_read(bio_data_dir(bio)),
-                              bch2_blk_status_to_str(bio->bi_status));
-               ca->sb_write_error = 1;
-       }
-
-       closure_put(&ca->fs->sb_write);
-       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-}
-
-static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bch_sb *sb = ca->disk_sb.sb;
-       struct bio *bio = ca->disk_sb.bio;
-
-       memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
-       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
-       bio->bi_iter.bi_sector  = le64_to_cpu(sb->layout.sb_offset[0]);
-       bio->bi_end_io          = write_super_endio;
-       bio->bi_private         = ca;
-       bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
-       this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
-
-       enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-       closure_bio_submit(bio, &c->sb_write);
-}
-
-static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
-{
-       struct bch_sb *sb = ca->disk_sb.sb;
-       struct bio *bio = ca->disk_sb.bio;
-
-       sb->offset = sb->layout.sb_offset[idx];
-
-       SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
-       sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
-                               null_nonce(), sb);
-
-       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
-       bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
-       bio->bi_end_io          = write_super_endio;
-       bio->bi_private         = ca;
-       bch2_bio_map(bio, sb,
-                    roundup((size_t) vstruct_bytes(sb),
-                            bdev_logical_block_size(ca->disk_sb.bdev)));
-
-       this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
-                    bio_sectors(bio));
-
-       enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-       closure_bio_submit(bio, &c->sb_write);
-}
-
-int bch2_write_super(struct bch_fs *c)
-{
-       struct closure *cl = &c->sb_write;
-       struct printbuf err = PRINTBUF;
-       unsigned sb = 0, nr_wrote;
-       struct bch_devs_mask sb_written;
-       bool wrote, can_mount_without_written, can_mount_with_written;
-       unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
-       DARRAY(struct bch_dev *) online_devices = {};
-       int ret = 0;
-
-       trace_and_count(c, write_super, c, _RET_IP_);
-
-       if (c->opts.degraded == BCH_DEGRADED_very)
-               degraded_flags |= BCH_FORCE_IF_LOST;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       closure_init_stack(cl);
-       memset(&sb_written, 0, sizeof(sb_written));
-
-       /*
-        * Note: we do writes to RO devices here, and we might want to change
-        * that in the future.
-        *
-        * For now, we expect to be able to call write_super() when we're not
-        * yet RW:
-        */
-       for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
-               ret = darray_push(&online_devices, ca);
-               if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
-                       enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-                       goto out;
-               }
-               enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-       }
-
-       /* Make sure we're using the new magic numbers: */
-       c->disk_sb.sb->magic = BCHFS_MAGIC;
-       c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
-
-       le64_add_cpu(&c->disk_sb.sb->seq, 1);
-
-       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-       darray_for_each(online_devices, ca)
-               __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
-       c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
-
-       if (test_bit(BCH_FS_error, &c->flags))
-               SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
-       if (test_bit(BCH_FS_topology_error, &c->flags))
-               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
-
-       SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
-
-       bch2_sb_counters_from_cpu(c);
-       bch2_sb_members_from_cpu(c);
-       bch2_sb_members_cpy_v2_v1(&c->disk_sb);
-       bch2_sb_errors_from_cpu(c);
-       bch2_sb_downgrade_update(c);
-
-       darray_for_each(online_devices, ca)
-               bch2_sb_from_fs(c, (*ca));
-
-       darray_for_each(online_devices, ca) {
-               printbuf_reset(&err);
-
-               ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
-               if (ret) {
-                       bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
-                       goto out;
-               }
-       }
-
-       if (c->opts.nochanges)
-               goto out;
-
-       /*
-        * Defer writing the superblock until filesystem initialization is
-        * complete - don't write out a partly initialized superblock:
-        */
-       if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
-               goto out;
-
-       if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
-               struct printbuf buf = PRINTBUF;
-               prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
-               bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
-               prt_str(&buf, " > ");
-               bch2_version_to_text(&buf, bcachefs_metadata_version_current);
-               prt_str(&buf, ")");
-               bch2_fs_fatal_error(c, ": %s", buf.buf);
-               printbuf_exit(&buf);
-               ret = bch_err_throw(c, sb_not_downgraded);
-               goto out;
-       }
-
-       darray_for_each(online_devices, ca) {
-               __set_bit((*ca)->dev_idx, sb_written.d);
-               (*ca)->sb_write_error = 0;
-       }
-
-       darray_for_each(online_devices, ca)
-               read_back_super(c, *ca);
-       closure_sync(cl);
-
-       darray_for_each(online_devices, cap) {
-               struct bch_dev *ca = *cap;
-
-               if (ca->sb_write_error)
-                       continue;
-
-               if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
-                       struct printbuf buf = PRINTBUF;
-                       prt_char(&buf, ' ');
-                       prt_bdevname(&buf, ca->disk_sb.bdev);
-                       prt_printf(&buf,
-                               ": Superblock write was silently dropped! (seq %llu expected %llu)",
-                               le64_to_cpu(ca->sb_read_scratch->seq),
-                               ca->disk_sb.seq);
-
-                       if (c->opts.errors != BCH_ON_ERROR_continue &&
-                           c->opts.errors != BCH_ON_ERROR_fix_safe) {
-                               ret = bch_err_throw(c, erofs_sb_err);
-                               bch2_fs_fatal_error(c, "%s", buf.buf);
-                       } else {
-                               bch_err(c, "%s", buf.buf);
-                       }
-
-                       printbuf_exit(&buf);
-               }
-
-               if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
-                       struct printbuf buf = PRINTBUF;
-                       prt_char(&buf, ' ');
-                       prt_bdevname(&buf, ca->disk_sb.bdev);
-                       prt_printf(&buf,
-                               ": Superblock modified by another process (seq %llu expected %llu)",
-                               le64_to_cpu(ca->sb_read_scratch->seq),
-                               ca->disk_sb.seq);
-                       bch2_fs_fatal_error(c, "%s", buf.buf);
-                       printbuf_exit(&buf);
-                       ret = bch_err_throw(c, erofs_sb_err);
-               }
-       }
-
-       if (ret)
-               goto out;
-
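-       /*
-        * Write superblock copies to every device in lockstep - copy 0
-        * everywhere, then copy 1, and so on - waiting for each round to
-        * complete before starting the next:
-        */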
-       do {
-               wrote = false;
-               darray_for_each(online_devices, cap) {
-                       struct bch_dev *ca = *cap;
-                       if (!ca->sb_write_error &&
-                           sb < ca->disk_sb.sb->layout.nr_superblocks) {
-                               write_one_super(c, ca, sb);
-                               wrote = true;
-                       }
-               }
-               closure_sync(cl);
-               sb++;
-       } while (wrote);
-
-       darray_for_each(online_devices, cap) {
-               struct bch_dev *ca = *cap;
-               if (ca->sb_write_error)
-                       __clear_bit(ca->dev_idx, sb_written.d);
-               else
-                       ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
-       }
-
-       nr_wrote = dev_mask_nr(&sb_written);
-
-       can_mount_with_written =
-               bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
-               sb_written.d[i] = ~sb_written.d[i];
-
-       can_mount_without_written =
-               bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
-       /*
-        * If we would be able to mount _without_ the devices we successfully
-        * wrote superblocks to, we weren't able to write to enough devices:
-        *
-        * Exception: if we can mount without the successes because we haven't
-        * written anything (new filesystem), we continue if we'd be able to
-        * mount with the devices we did successfully write to:
-        */
-       if (bch2_fs_fatal_err_on(!nr_wrote ||
-                                !can_mount_with_written ||
-                                (can_mount_without_written &&
-                                 !can_mount_with_written), c,
-               ": Unable to write superblock to sufficient devices (from %ps)",
-               (void *) _RET_IP_))
-               ret = bch_err_throw(c, erofs_sb_err);
-out:
-       /* Make new options visible after they're persistent: */
-       bch2_sb_update(c);
-       darray_for_each(online_devices, ca)
-               enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
-       darray_exit(&online_devices);
-       printbuf_exit(&err);
-       return ret;
-}
-
-void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
-       mutex_lock(&c->sb_lock);
-       if (!(c->sb.features & (1ULL << feat))) {
-               c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
-
-               bch2_write_super(c);
-       }
-       mutex_unlock(&c->sb_lock);
-}
-
-/* Downgrade if superblock is at a higher version than currently supported: */
-bool bch2_check_version_downgrade(struct bch_fs *c)
-{
-       bool ret = bcachefs_metadata_version_current < c->sb.version;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       /*
-        * Downgrade, if superblock is at a higher version than currently
-        * supported:
-        *
-        * c->sb will be checked before we write the superblock, so update it as
-        * well:
-        */
-       if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
-               SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
-       if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
-               SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
-       if (c->sb.version > bcachefs_metadata_version_current)
-               c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
-       if (c->sb.version_min > bcachefs_metadata_version_current)
-               c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
-       c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
-       return ret;
-}
-
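-/*
- * Upgrade the on-disk version: a major version bump invalidates the downgrade
- * table, and an incompat upgrade also sets BCH_SB_FEATURES_ALL and raises
- * version_incompat_allowed:
- */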
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
-{
-       lockdep_assert_held(&c->sb_lock);
-
-       if (BCH_VERSION_MAJOR(new_version) >
-           BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
-               bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
-
-       c->disk_sb.sb->version = cpu_to_le16(new_version);
-
-       if (incompat) {
-               c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
-               SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
-                       max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
-       }
-}
-
-void bch2_sb_upgrade_incompat(struct bch_fs *c)
-{
-       mutex_lock(&c->sb_lock);
-       if (c->sb.version == c->sb.version_incompat_allowed)
-               goto unlock;
-
-       struct printbuf buf = PRINTBUF;
-
-       prt_str(&buf, "Now allowing incompatible features up to ");
-       bch2_version_to_text(&buf, c->sb.version);
-       prt_str(&buf, ", previously allowed up to ");
-       bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
-       prt_newline(&buf);
-
-       bch_notice(c, "%s", buf.buf);
-       printbuf_exit(&buf);
-
-       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
-       SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
-                       max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
-       bch2_write_super(c);
-unlock:
-       mutex_unlock(&c->sb_lock);
-}
-
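-/*
- * The "ext" superblock section records recovery passes that must be run,
- * errors to silently fix, and btrees with missing data:
- */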
-static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum bch_validate_flags flags, struct printbuf *err)
-{
-       if (vstruct_bytes(f) < 88) {
-               prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
-               return -BCH_ERR_invalid_sb_ext;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
-                               struct bch_sb_field *f)
-{
-       struct bch_sb_field_ext *e = field_to_type(f, ext);
-
-       prt_printf(out, "Recovery passes required:\t");
-       prt_bitflags(out, bch2_recovery_passes,
-                    bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
-       prt_newline(out);
-
-       unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
-       if (errors_silent) {
-               le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
-
-               prt_printf(out, "Errors to silently fix:\t");
-               prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
-                                   min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
-               prt_newline(out);
-
-               kfree(errors_silent);
-       }
-
-       prt_printf(out, "Btrees with missing data:\t");
-       prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
-       prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
-       .validate       = bch2_sb_ext_validate,
-       .to_text        = bch2_sb_ext_to_text,
-};
-
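-/*
- * Dispatch table from field type to ops, generated from the BCH_SB_FIELDS()
- * x-macro; field types we don't know about get empty ops, so superblocks from
- * newer versions can still be read:
- */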
-static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
-#define x(f, nr)                                       \
-       [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
-       BCH_SB_FIELDS()
-#undef x
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_null_ops;
-
-static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
-{
-       return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
-               ? bch2_sb_field_ops[type]
-               : &bch2_sb_field_null_ops;
-}
-
-static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 enum bch_validate_flags flags, struct printbuf *err)
-{
-       unsigned type = le32_to_cpu(f->type);
-       struct printbuf field_err = PRINTBUF;
-       const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
-       int ret;
-
-       ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
-       if (ret) {
-               prt_printf(err, "Invalid superblock section %s: %s",
-                          bch2_sb_fields[type], field_err.buf);
-               prt_newline(err);
-               bch2_sb_field_to_text(err, sb, f);
-       }
-
-       printbuf_exit(&field_err);
-       return ret;
-}
-
-void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
-                            struct bch_sb_field *f)
-{
-       unsigned type = le32_to_cpu(f->type);
-       const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
-
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 32);
-
-       if (ops->to_text)
-               ops->to_text(out, sb, f);
-}
-
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
-                          struct bch_sb_field *f)
-{
-       unsigned type = le32_to_cpu(f->type);
-
-       if (type < BCH_SB_FIELD_NR)
-               prt_printf(out, "%s", bch2_sb_fields[type]);
-       else
-               prt_printf(out, "(unknown field %u)", type);
-
-       prt_printf(out, " (size %zu):", vstruct_bytes(f));
-       prt_newline(out);
-
-       __bch2_sb_field_to_text(out, sb, f);
-}
-
-void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
-{
-       unsigned i;
-
-       prt_printf(out, "Type:                    %u", l->layout_type);
-       prt_newline(out);
-
-       prt_str(out, "Superblock max size:     ");
-       prt_units_u64(out, 512 << l->sb_max_size_bits);
-       prt_newline(out);
-
-       prt_printf(out, "Nr superblocks:          %u", l->nr_superblocks);
-       prt_newline(out);
-
-       prt_str(out, "Offsets:                 ");
-       for (i = 0; i < l->nr_superblocks; i++) {
-               if (i)
-                       prt_str(out, ", ");
-               prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
-       }
-       prt_newline(out);
-}
-
-void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
-                    bool print_layout, unsigned fields)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 44);
-
-       prt_printf(out, "External UUID:\t");
-       pr_uuid(out, sb->user_uuid.b);
-       prt_newline(out);
-
-       prt_printf(out, "Internal UUID:\t");
-       pr_uuid(out, sb->uuid.b);
-       prt_newline(out);
-
-       prt_printf(out, "Magic number:\t");
-       pr_uuid(out, sb->magic.b);
-       prt_newline(out);
-
-       prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
-
-       prt_printf(out, "Label:\t");
-       if (!strlen(sb->label))
-               prt_printf(out, "(none)");
-       else
-               prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
-       prt_newline(out);
-
-       prt_printf(out, "Version:\t");
-       bch2_version_to_text(out, le16_to_cpu(sb->version));
-       prt_newline(out);
-
-       prt_printf(out, "Incompatible features allowed:\t");
-       bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
-       prt_newline(out);
-
-       prt_printf(out, "Incompatible features in use:\t");
-       bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
-       prt_newline(out);
-
-       prt_printf(out, "Version upgrade complete:\t");
-       bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
-       prt_newline(out);
-
-       prt_printf(out, "Oldest version on disk:\t");
-       bch2_version_to_text(out, le16_to_cpu(sb->version_min));
-       prt_newline(out);
-
-       prt_printf(out, "Created:\t");
-       if (sb->time_base_lo)
-               bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
-       else
-               prt_printf(out, "(not set)");
-       prt_newline(out);
-
-       prt_printf(out, "Sequence number:\t");
-       prt_printf(out, "%llu", le64_to_cpu(sb->seq));
-       prt_newline(out);
-
-       prt_printf(out, "Time of last write:\t");
-       bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
-       prt_newline(out);
-
-       prt_printf(out, "Superblock size:\t");
-       prt_units_u64(out, vstruct_bytes(sb));
-       prt_str(out, "/");
-       prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
-       prt_newline(out);
-
-       prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
-       prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
-
-       prt_printf(out, "Sections:\t");
-       u64 fields_have = 0;
-       vstruct_for_each(sb, f)
-               fields_have |= BIT_ULL(le32_to_cpu(f->type));
-       prt_bitflags(out, bch2_sb_fields, fields_have);
-       prt_newline(out);
-
-       prt_printf(out, "Features:\t");
-       prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
-       prt_newline(out);
-
-       prt_printf(out, "Compat features:\t");
-       prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
-       prt_newline(out);
-
-       prt_newline(out);
-       prt_printf(out, "Options:");
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-       {
-               enum bch_opt_id id;
-
-               for (id = 0; id < bch2_opts_nr; id++) {
-                       const struct bch_option *opt = bch2_opt_table + id;
-
-                       if (opt->get_sb) {
-                               u64 v = bch2_opt_from_sb(sb, id, -1);
-
-                               prt_printf(out, "%s:\t", opt->attr.name);
-                               bch2_opt_to_text(out, NULL, sb, opt, v,
-                                                OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
-                               prt_newline(out);
-                       }
-               }
-       }
-
-       printbuf_indent_sub(out, 2);
-
-       if (print_layout) {
-               prt_newline(out);
-               prt_printf(out, "layout:");
-               prt_newline(out);
-               printbuf_indent_add(out, 2);
-               bch2_sb_layout_to_text(out, &sb->layout);
-               printbuf_indent_sub(out, 2);
-       }
-
-       vstruct_for_each(sb, f)
-               if (fields & (1 << le32_to_cpu(f->type))) {
-                       prt_newline(out);
-                       bch2_sb_field_to_text(out, sb, f);
-               }
-}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
deleted file mode 100644 (file)
index a3b7a90..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_IO_H
-#define _BCACHEFS_SUPER_IO_H
-
-#include "extents.h"
-#include "eytzinger.h"
-#include "super_types.h"
-#include "super.h"
-#include "sb-members.h"
-
-#include <asm/byteorder.h>
-
-#define BCH_SB_READ_SCRATCH_BUF_SIZE           4096
-
-static inline bool bch2_version_compatible(u16 version)
-{
-       return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
-               version >= bcachefs_metadata_version_min;
-}
-
-void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-
-int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-
-static inline int bch2_request_incompat_feature(struct bch_fs *c,
-                                               enum bcachefs_metadata_version version)
-{
-       return likely(version <= c->sb.version_incompat)
-               ? 0
-               : bch2_set_version_incompat(c, version);
-}
-
-static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
-{
-       return le32_to_cpu(f->u64s) * sizeof(u64);
-}
-
-#define field_to_type(_f, _name)                                       \
-       container_of_or_null(_f, struct bch_sb_field_##_name, field)
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
-#define bch2_sb_field_get(_sb, _name)                                  \
-       field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
-                                            enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_resize(_sb, _name, _u64s)                                \
-       field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
-                                       enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_get_minsize(_sb, _name, _u64s)                           \
-       field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-#define bch2_sb_field_nr_entries(_f)                                   \
-       (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) /        \
-              sizeof(_f->entries[0]))                                  \
-           : 0)
-
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
-
-extern const char * const bch2_sb_fields[];
-
-struct bch_sb_field_ops {
-       int     (*validate)(struct bch_sb *, struct bch_sb_field *,
-                           enum bch_validate_flags, struct printbuf *);
-       void    (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
-};
-
-static inline __le64 bch2_sb_magic(struct bch_fs *c)
-{
-       __le64 ret;
-
-       memcpy(&ret, &c->sb.uuid, sizeof(ret));
-       return ret;
-}
-
-static inline __u64 jset_magic(struct bch_fs *c)
-{
-       return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
-}
-
-static inline __u64 bset_magic(struct bch_fs *c)
-{
-       return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
-}
-
-int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
-int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
-
-void bch2_free_super(struct bch_sb_handle *);
-int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
-
-int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_write_super(struct bch_fs *);
-void __bch2_check_set_feature(struct bch_fs *, unsigned);
-
-static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
-       if (!(c->sb.features & (1ULL << feat)))
-               __bch2_check_set_feature(c, feat);
-}
-
-bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
-void bch2_sb_upgrade_incompat(struct bch_fs *);
-
-void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
-                            struct bch_sb_field *);
-void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
-                          struct bch_sb_field *);
-void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
-void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
-
-#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
deleted file mode 100644 (file)
index f2417d2..0000000
+++ /dev/null
@@ -1,2547 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs setup/teardown code, and some metadata io - read a superblock and
- * figure out what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "backpointers.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_write_buffer.h"
-#include "buckets_waiting_for_journal.h"
-#include "chardev.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "move.h"
-#include "migrate.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "sysfs.h"
-#include "thread_with_file.h"
-#include "trace.h"
-
-#include <linux/backing-dev.h>
-#include <linux/blkdev.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/random.h>
-#include <linux/sysfs.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-MODULE_DESCRIPTION("bcachefs filesystem");
-
-typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
-
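-/* Expand the x-macro lists below into NULL-terminated string tables: */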
-#define x(n)           #n,
-const char * const bch2_fs_flag_strs[] = {
-       BCH_FS_FLAGS()
-       NULL
-};
-
-const char * const bch2_write_refs[] = {
-       BCH_WRITE_REFS()
-       NULL
-};
-
-const char * const bch2_dev_read_refs[] = {
-       BCH_DEV_READ_REFS()
-       NULL
-};
-
-const char * const bch2_dev_write_refs[] = {
-       BCH_DEV_WRITE_REFS()
-       NULL
-};
-#undef x
-
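-/*
- * Log output helpers: if this filesystem has a stdio redirect attached,
- * output is steered to that userspace thread instead of the kernel log:
- */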
-static void __bch2_print_str(struct bch_fs *c, const char *prefix,
-                            const char *str)
-{
-#ifdef __KERNEL__
-       struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
-       if (unlikely(stdio)) {
-               bch2_stdio_redirect_printf(stdio, true, "%s", str);
-               return;
-       }
-#endif
-       bch2_print_string_as_lines(KERN_ERR, str);
-}
-
-void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
-{
-       __bch2_print_str(c, prefix, str);
-}
-
-__printf(2, 0)
-static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
-{
-#ifdef __KERNEL__
-       if (unlikely(stdio)) {
-               if (fmt[0] == KERN_SOH[0])
-                       fmt += 2;
-
-               bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
-               return;
-       }
-#endif
-       vprintk(fmt, args);
-}
-
-void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
-{
-       struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
-
-       va_list args;
-       va_start(args, fmt);
-       bch2_print_maybe_redirect(stdio, fmt, args);
-       va_end(args);
-}
-
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
-{
-       struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
-       va_list args;
-       va_start(args, fmt);
-       bch2_print_maybe_redirect(stdio, fmt, args);
-       va_end(args);
-}
-
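-/*
- * KTYPE(type) generates the sysfs boilerplate for one kobject type: an
- * attribute group wrapping type ## _files, a NULL-terminated group list,
- * and a kobj_type wiring up release, sysfs_ops and default_groups:
- */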
-#define KTYPE(type)                                                    \
-static const struct attribute_group type ## _group = {                 \
-       .attrs = type ## _files                                         \
-};                                                                     \
-                                                                       \
-static const struct attribute_group *type ## _groups[] = {             \
-       &type ## _group,                                                \
-       NULL                                                            \
-};                                                                     \
-                                                                       \
-static const struct kobj_type type ## _ktype = {                       \
-       .release        = type ## _release,                             \
-       .sysfs_ops      = &type ## _sysfs_ops,                          \
-       .default_groups = type ## _groups                               \
-}
-
-static void bch2_fs_release(struct kobject *);
-static void bch2_dev_release(struct kobject *);
-static void bch2_fs_counters_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_internal_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_opts_dir_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_time_stats_release(struct kobject *k)
-{
-}
-
-KTYPE(bch2_fs);
-KTYPE(bch2_fs_counters);
-KTYPE(bch2_fs_internal);
-KTYPE(bch2_fs_opts_dir);
-KTYPE(bch2_fs_time_stats);
-KTYPE(bch2_dev);
-
-static struct kset *bcachefs_kset;
-static LIST_HEAD(bch_fs_list);
-static DEFINE_MUTEX(bch_fs_list_lock);
-
-DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
-
-static void bch2_dev_unlink(struct bch_dev *);
-static void bch2_dev_free(struct bch_dev *);
-static int bch2_dev_alloc(struct bch_fs *, unsigned);
-static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
-static void bch2_dev_io_ref_stop(struct bch_dev *, int);
-static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-
-struct bch_fs *bch2_dev_to_fs(dev_t dev)
-{
-       guard(mutex)(&bch_fs_list_lock);
-       guard(rcu)();
-
-       struct bch_fs *c;
-       list_for_each_entry(c, &bch_fs_list, list)
-               for_each_member_device_rcu(c, ca, NULL)
-                       if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
-                               closure_get(&c->cl);
-                               return c;
-                       }
-       return NULL;
-}
-
-static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
-{
-       struct bch_fs *c;
-
-       lockdep_assert_held(&bch_fs_list_lock);
-
-       list_for_each_entry(c, &bch_fs_list, list)
-               if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
-                       return c;
-
-       return NULL;
-}
-
-struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
-{
-       struct bch_fs *c;
-
-       mutex_lock(&bch_fs_list_lock);
-       c = __bch2_uuid_to_fs(uuid);
-       if (c)
-               closure_get(&c->cl);
-       mutex_unlock(&bch_fs_list_lock);
-
-       return c;
-}
-
-/* Filesystem RO/RW: */
-
-/*
- * For startup/shutdown of RW stuff, the dependencies are:
- *
- * - foreground writes depend on copygc and rebalance (to free up space)
- *
- * - copygc and rebalance depend on mark and sweep gc (they actually probably
- *   don't because they either reserve ahead of time or don't block if
- *   allocations fail, but allocations can require mark and sweep gc to run
- *   because of generation number wraparound)
- *
- * - all of the above depends on the allocator threads
- *
- * - allocator depends on the journal (when it rewrites prios and gens)
- */
-
-static void __bch2_fs_read_only(struct bch_fs *c)
-{
-       unsigned clean_passes = 0;
-       u64 seq = 0;
-
-       bch2_fs_ec_stop(c);
-       bch2_open_buckets_stop(c, NULL, true);
-       bch2_rebalance_stop(c);
-       bch2_copygc_stop(c);
-       bch2_fs_ec_flush(c);
-
-       bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
-                   journal_cur_seq(&c->journal));
-
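-       /*
-        * Loop until nothing makes progress for two consecutive passes: each
-        * flush below reports whether it did work, and any work - or a change
-        * in the journal sequence number - resets the clean pass counter:
-        */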
-       do {
-               clean_passes++;
-
-               if (bch2_btree_interior_updates_flush(c) ||
-                   bch2_btree_write_buffer_flush_going_ro(c) ||
-                   bch2_journal_flush_all_pins(&c->journal) ||
-                   bch2_btree_flush_all_writes(c) ||
-                   seq != atomic64_read(&c->journal.seq)) {
-                       seq = atomic64_read(&c->journal.seq);
-                       clean_passes = 0;
-               }
-       } while (clean_passes < 2);
-
-       bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
-                   journal_cur_seq(&c->journal));
-
-       if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
-           !test_bit(BCH_FS_emergency_ro, &c->flags))
-               set_bit(BCH_FS_clean_shutdown, &c->flags);
-
-       bch2_fs_journal_stop(&c->journal);
-
-       bch_info(c, "%sclean shutdown complete, journal seq %llu",
-                test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
-                c->journal.seq_ondisk);
-
-       /*
-        * After stopping the journal, stop writes to each device and remove
-        * it from the allocator:
-        */
-       for_each_member_device(c, ca) {
-               bch2_dev_io_ref_stop(ca, WRITE);
-               bch2_dev_allocator_remove(c, ca);
-       }
-}
-
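-/* enumerated_ref callback: run when the last write ref is dropped: */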
-static void bch2_writes_disabled(struct enumerated_ref *writes)
-{
-       struct bch_fs *c = container_of(writes, struct bch_fs, writes);
-
-       set_bit(BCH_FS_write_disable_complete, &c->flags);
-       wake_up(&bch2_read_only_wait);
-}
-
-void bch2_fs_read_only(struct bch_fs *c)
-{
-       if (!test_bit(BCH_FS_rw, &c->flags)) {
-               bch2_journal_reclaim_stop(&c->journal);
-               return;
-       }
-
-       BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
-
-       bch_verbose(c, "going read-only");
-
-       /*
-        * Block new foreground-end write operations from starting - any new
-        * writes will return -EROFS:
-        */
-       set_bit(BCH_FS_going_ro, &c->flags);
-       enumerated_ref_stop_async(&c->writes);
-
-       /*
-        * If we're not doing an emergency shutdown, we want to wait on
-        * outstanding writes to complete so they don't see spurious errors due
-        * to shutting down the allocator:
-        *
-        * If we are doing an emergency shutdown, outstanding writes may
-        * hang until we shut down the allocator, so we don't want to wait
-        * on them before shutting everything down - but we do need to wait
-        * on them before returning and signalling that going RO is
-        * complete:
-        */
-       wait_event(bch2_read_only_wait,
-                  test_bit(BCH_FS_write_disable_complete, &c->flags) ||
-                  test_bit(BCH_FS_emergency_ro, &c->flags));
-
-       bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
-       if (writes_disabled)
-               bch_verbose(c, "finished waiting for writes to stop");
-
-       __bch2_fs_read_only(c);
-
-       wait_event(bch2_read_only_wait,
-                  test_bit(BCH_FS_write_disable_complete, &c->flags));
-
-       if (!writes_disabled)
-               bch_verbose(c, "finished waiting for writes to stop");
-
-       clear_bit(BCH_FS_write_disable_complete, &c->flags);
-       clear_bit(BCH_FS_going_ro, &c->flags);
-       clear_bit(BCH_FS_rw, &c->flags);
-
-       if (!bch2_journal_error(&c->journal) &&
-           !test_bit(BCH_FS_error, &c->flags) &&
-           !test_bit(BCH_FS_emergency_ro, &c->flags) &&
-           test_bit(BCH_FS_started, &c->flags) &&
-           test_bit(BCH_FS_clean_shutdown, &c->flags) &&
-           c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
-               BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
-               BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
-               BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
-               BUG_ON(c->btree_write_buffer.inc.keys.nr);
-               BUG_ON(c->btree_write_buffer.flushing.keys.nr);
-               bch2_verify_accounting_clean(c);
-
-               bch_verbose(c, "marking filesystem clean");
-               bch2_fs_mark_clean(c);
-       } else {
-               /* Make sure error counts/counters are persisted */
-               mutex_lock(&c->sb_lock);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-
-               bch_verbose(c, "done going read-only, filesystem not clean");
-       }
-}
-
-static void bch2_fs_read_only_work(struct work_struct *work)
-{
-       struct bch_fs *c =
-               container_of(work, struct bch_fs, read_only_work);
-
-       down_write(&c->state_lock);
-       bch2_fs_read_only(c);
-       up_write(&c->state_lock);
-}
-
-static void bch2_fs_read_only_async(struct bch_fs *c)
-{
-       queue_work(system_long_wq, &c->read_only_work);
-}
-
-bool bch2_fs_emergency_read_only(struct bch_fs *c)
-{
-       bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
-       bch2_journal_halt(&c->journal);
-       bch2_fs_read_only_async(c);
-
-       wake_up(&bch2_read_only_wait);
-       return ret;
-}
-
-static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
-                                          bool locked)
-{
-       bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
-       if (!locked)
-               bch2_journal_halt(&c->journal);
-       else
-               bch2_journal_halt_locked(&c->journal);
-       bch2_fs_read_only_async(c);
-       wake_up(&bch2_read_only_wait);
-
-       if (ret)
-               prt_printf(out, "emergency read only at seq %llu\n",
-                          journal_cur_seq(&c->journal));
-
-       return ret;
-}
-
-bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
-{
-       return __bch2_fs_emergency_read_only2(c, out, false);
-}
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
-{
-       bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
-       bch2_journal_halt_locked(&c->journal);
-       bch2_fs_read_only_async(c);
-
-       wake_up(&bch2_read_only_wait);
-       return ret;
-}
-
-static int __bch2_fs_read_write(struct bch_fs *c, bool early)
-{
-       int ret;
-
-       BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
-       if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
-               return bch_err_throw(c, erofs_no_alloc_info);
-
-       if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
-               bch_err(c, "cannot go rw, unfixed btree errors");
-               return bch_err_throw(c, erofs_unfixed_errors);
-       }
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
-               bch_err(c, "cannot go rw, filesystem is an unresized image file");
-               return bch_err_throw(c, erofs_filesystem_full);
-       }
-
-       if (test_bit(BCH_FS_rw, &c->flags))
-               return 0;
-
-       bch_info(c, "going read-write");
-
-       ret = bch2_fs_init_rw(c);
-       if (ret)
-               goto err;
-
-       ret = bch2_sb_members_v2_init(c);
-       if (ret)
-               goto err;
-
-       clear_bit(BCH_FS_clean_shutdown, &c->flags);
-
-       scoped_guard(rcu)
-               for_each_online_member_rcu(c, ca)
-                       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-                               bch2_dev_allocator_add(c, ca);
-                               enumerated_ref_start(&ca->io_ref[WRITE]);
-                       }
-
-       bch2_recalc_capacity(c);
-
-       /*
-        * First journal write must be a flush write: after a clean shutdown we
-        * don't read the journal, so the first journal write may end up
-        * overwriting whatever was there previously, and there must always be
-        * at least one non-flush write in the journal or recovery will fail:
-        */
-       spin_lock(&c->journal.lock);
-       set_bit(JOURNAL_need_flush_write, &c->journal.flags);
-       set_bit(JOURNAL_running, &c->journal.flags);
-       bch2_journal_space_available(&c->journal);
-       spin_unlock(&c->journal.lock);
-
-       ret = bch2_fs_mark_dirty(c);
-       if (ret)
-               goto err;
-
-       ret = bch2_journal_reclaim_start(&c->journal);
-       if (ret)
-               goto err;
-
-       set_bit(BCH_FS_rw, &c->flags);
-       set_bit(BCH_FS_was_rw, &c->flags);
-
-       enumerated_ref_start(&c->writes);
-
-       ret = bch2_copygc_start(c);
-       if (ret) {
-               bch_err_msg(c, ret, "error starting copygc thread");
-               goto err;
-       }
-
-       ret = bch2_rebalance_start(c);
-       if (ret) {
-               bch_err_msg(c, ret, "error starting rebalance thread");
-               goto err;
-       }
-
-       bch2_do_discards(c);
-       bch2_do_invalidates(c);
-       bch2_do_stripe_deletes(c);
-       bch2_do_pending_node_rewrites(c);
-       return 0;
-err:
-       if (test_bit(BCH_FS_rw, &c->flags))
-               bch2_fs_read_only(c);
-       else
-               __bch2_fs_read_only(c);
-       return ret;
-}
-
-int bch2_fs_read_write(struct bch_fs *c)
-{
-       if (c->opts.recovery_pass_last &&
-           c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
-               return bch_err_throw(c, erofs_norecovery);
-
-       if (c->opts.nochanges)
-               return bch_err_throw(c, erofs_nochanges);
-
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
-               return bch_err_throw(c, erofs_no_alloc_info);
-
-       return __bch2_fs_read_write(c, false);
-}
-
-int bch2_fs_read_write_early(struct bch_fs *c)
-{
-       down_write(&c->state_lock);
-       int ret = __bch2_fs_read_write(c, true);
-       up_write(&c->state_lock);
-
-       return ret;
-}
-
-/* Filesystem startup/shutdown: */
-
-static void __bch2_fs_free(struct bch_fs *c)
-{
-       for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
-               bch2_time_stats_exit(&c->times[i]);
-
-#ifdef CONFIG_UNICODE
-       utf8_unload(c->cf_encoding);
-#endif
-
-       bch2_find_btree_nodes_exit(&c->found_btree_nodes);
-       bch2_free_pending_node_rewrites(c);
-       bch2_free_fsck_errs(c);
-       bch2_fs_vfs_exit(c);
-       bch2_fs_snapshots_exit(c);
-       bch2_fs_sb_errors_exit(c);
-       bch2_fs_replicas_exit(c);
-       bch2_fs_rebalance_exit(c);
-       bch2_fs_quota_exit(c);
-       bch2_fs_nocow_locking_exit(c);
-       bch2_fs_journal_exit(&c->journal);
-       bch2_fs_fs_io_direct_exit(c);
-       bch2_fs_fs_io_buffered_exit(c);
-       bch2_fs_fsio_exit(c);
-       bch2_fs_io_write_exit(c);
-       bch2_fs_io_read_exit(c);
-       bch2_fs_encryption_exit(c);
-       bch2_fs_ec_exit(c);
-       bch2_fs_counters_exit(c);
-       bch2_fs_compress_exit(c);
-       bch2_io_clock_exit(&c->io_clock[WRITE]);
-       bch2_io_clock_exit(&c->io_clock[READ]);
-       bch2_fs_buckets_waiting_for_journal_exit(c);
-       bch2_fs_btree_write_buffer_exit(c);
-       bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
-       bch2_fs_btree_iter_exit(c);
-       bch2_fs_btree_interior_update_exit(c);
-       bch2_fs_btree_cache_exit(c);
-       bch2_fs_accounting_exit(c);
-       bch2_fs_async_obj_exit(c);
-       bch2_journal_keys_put_initial(c);
-
-       BUG_ON(atomic_read(&c->journal_keys.ref));
-       percpu_free_rwsem(&c->mark_lock);
-       if (c->online_reserved) {
-               u64 v = percpu_u64_get(c->online_reserved);
-               WARN(v, "online_reserved not 0 at shutdown: %lli", v);
-               free_percpu(c->online_reserved);
-       }
-
-       darray_exit(&c->incompat_versions_requested);
-       darray_exit(&c->btree_roots_extra);
-       free_percpu(c->pcpu);
-       free_percpu(c->usage);
-       mempool_exit(&c->large_bkey_pool);
-       mempool_exit(&c->btree_bounce_pool);
-       bioset_exit(&c->btree_bio);
-       mempool_exit(&c->fill_iter);
-       enumerated_ref_exit(&c->writes);
-       kfree(rcu_dereference_protected(c->disk_groups, 1));
-       kfree(c->journal_seq_blacklist_table);
-
-       if (c->write_ref_wq)
-               destroy_workqueue(c->write_ref_wq);
-       if (c->btree_write_submit_wq)
-               destroy_workqueue(c->btree_write_submit_wq);
-       if (c->btree_read_complete_wq)
-               destroy_workqueue(c->btree_read_complete_wq);
-       if (c->copygc_wq)
-               destroy_workqueue(c->copygc_wq);
-       if (c->btree_write_complete_wq)
-               destroy_workqueue(c->btree_write_complete_wq);
-       if (c->btree_update_wq)
-               destroy_workqueue(c->btree_update_wq);
-
-       bch2_free_super(&c->disk_sb);
-       kvfree(c);
-       module_put(THIS_MODULE);
-}
-
-static void bch2_fs_release(struct kobject *kobj)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
-       __bch2_fs_free(c);
-}
-
-void __bch2_fs_stop(struct bch_fs *c)
-{
-       bch_verbose(c, "shutting down");
-
-       set_bit(BCH_FS_stopping, &c->flags);
-
-       down_write(&c->state_lock);
-       bch2_fs_read_only(c);
-       up_write(&c->state_lock);
-
-       for (unsigned i = 0; i < c->sb.nr_devices; i++) {
-               struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-               if (ca)
-                       bch2_dev_io_ref_stop(ca, READ);
-       }
-
-       for_each_member_device(c, ca)
-               bch2_dev_unlink(ca);
-
-       if (c->kobj.state_in_sysfs)
-               kobject_del(&c->kobj);
-
-       bch2_fs_debug_exit(c);
-       bch2_fs_chardev_exit(c);
-
-       bch2_ro_ref_put(c);
-       wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
-
-       kobject_put(&c->counters_kobj);
-       kobject_put(&c->time_stats);
-       kobject_put(&c->opts_dir);
-       kobject_put(&c->internal);
-
-       /* btree prefetch might have kicked off reads in the background: */
-       bch2_btree_flush_all_reads(c);
-
-       for_each_member_device(c, ca)
-               cancel_work_sync(&ca->io_error_work);
-
-       cancel_work_sync(&c->read_only_work);
-}
-
-void bch2_fs_free(struct bch_fs *c)
-{
-       mutex_lock(&bch_fs_list_lock);
-       list_del(&c->list);
-       mutex_unlock(&bch_fs_list_lock);
-
-       closure_sync(&c->cl);
-       closure_debug_destroy(&c->cl);
-
-       for (unsigned i = 0; i < c->sb.nr_devices; i++) {
-               struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
-               if (ca) {
-                       EBUG_ON(atomic_long_read(&ca->ref) != 1);
-                       bch2_dev_io_ref_stop(ca, READ);
-                       bch2_free_super(&ca->disk_sb);
-                       bch2_dev_free(ca);
-               }
-       }
-
-       bch_verbose(c, "shutdown complete");
-
-       kobject_put(&c->kobj);
-}
-
-void bch2_fs_stop(struct bch_fs *c)
-{
-       __bch2_fs_stop(c);
-       bch2_fs_free(c);
-}
-
-static int bch2_fs_online(struct bch_fs *c)
-{
-       int ret = 0;
-
-       lockdep_assert_held(&bch_fs_list_lock);
-
-       if (c->sb.multi_device &&
-           __bch2_uuid_to_fs(c->sb.uuid)) {
-               bch_err(c, "filesystem UUID already open");
-               return bch_err_throw(c, filesystem_uuid_already_open);
-       }
-
-       ret = bch2_fs_chardev_init(c);
-       if (ret) {
-               bch_err(c, "error creating character device");
-               return ret;
-       }
-
-       bch2_fs_debug_init(c);
-
-       ret = (c->sb.multi_device
-              ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
-              : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
-           kobject_add(&c->internal, &c->kobj, "internal") ?:
-           kobject_add(&c->opts_dir, &c->kobj, "options") ?:
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-           kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
-#endif
-           kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
-           bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
-       if (ret) {
-               bch_err(c, "error creating sysfs objects");
-               return ret;
-       }
-
-       down_write(&c->state_lock);
-
-       for_each_member_device(c, ca) {
-               ret = bch2_dev_sysfs_online(c, ca);
-               if (ret) {
-                       bch_err(c, "error creating sysfs objects");
-                       bch2_dev_put(ca);
-                       goto err;
-               }
-       }
-
-       BUG_ON(!list_empty(&c->list));
-       list_add(&c->list, &bch_fs_list);
-err:
-       up_write(&c->state_lock);
-       return ret;
-}
-
-int bch2_fs_init_rw(struct bch_fs *c)
-{
-       if (test_bit(BCH_FS_rw_init_done, &c->flags))
-               return 0;
-
-       if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
-           !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) ||
-           !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_PERCPU, 1)) ||
-           !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) ||
-           !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
-                               WQ_FREEZABLE|WQ_PERCPU, 0)))
-               return bch_err_throw(c, ENOMEM_fs_other_alloc);
-
-       int ret = bch2_fs_btree_interior_update_init(c) ?:
-               bch2_fs_btree_write_buffer_init(c) ?:
-               bch2_fs_fs_io_buffered_init(c) ?:
-               bch2_fs_io_write_init(c) ?:
-               bch2_fs_journal_init(&c->journal);
-       if (ret)
-               return ret;
-
-       set_bit(BCH_FS_rw_init_done, &c->flags);
-       return 0;
-}
-
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
-                                   bch_sb_handles *sbs)
-{
-       struct bch_fs *c;
-       struct printbuf name = PRINTBUF;
-       unsigned i, iter_size;
-       int ret = 0;
-
-       c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
-       if (!c) {
-               c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
-               goto out;
-       }
-
-       c->stdio = (void *)(unsigned long) opts->stdio;
-
-       __module_get(THIS_MODULE);
-
-       closure_init(&c->cl, NULL);
-
-       c->kobj.kset = bcachefs_kset;
-       kobject_init(&c->kobj, &bch2_fs_ktype);
-       kobject_init(&c->internal, &bch2_fs_internal_ktype);
-       kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
-       kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
-       kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
-
-       c->minor                = -1;
-       c->disk_sb.fs_sb        = true;
-
-       init_rwsem(&c->state_lock);
-       mutex_init(&c->sb_lock);
-       mutex_init(&c->replicas_gc_lock);
-       mutex_init(&c->btree_root_lock);
-       INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
-
-       refcount_set(&c->ro_ref, 1);
-       init_waitqueue_head(&c->ro_ref_wait);
-
-       for (i = 0; i < BCH_TIME_STAT_NR; i++)
-               bch2_time_stats_init(&c->times[i]);
-
-       bch2_fs_allocator_background_init(c);
-       bch2_fs_allocator_foreground_init(c);
-       bch2_fs_btree_cache_init_early(&c->btree_cache);
-       bch2_fs_btree_gc_init_early(c);
-       bch2_fs_btree_interior_update_init_early(c);
-       bch2_fs_btree_iter_init_early(c);
-       bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
-       bch2_fs_btree_write_buffer_init_early(c);
-       bch2_fs_copygc_init(c);
-       bch2_fs_ec_init_early(c);
-       bch2_fs_journal_init_early(&c->journal);
-       bch2_fs_journal_keys_init(c);
-       bch2_fs_move_init(c);
-       bch2_fs_nocow_locking_init_early(c);
-       bch2_fs_quota_init(c);
-       bch2_fs_recovery_passes_init(c);
-       bch2_fs_sb_errors_init_early(c);
-       bch2_fs_snapshots_init_early(c);
-       bch2_fs_subvolumes_init_early(c);
-
-       INIT_LIST_HEAD(&c->list);
-
-       mutex_init(&c->bio_bounce_pages_lock);
-       mutex_init(&c->snapshot_table_lock);
-       init_rwsem(&c->snapshot_create_lock);
-
-       spin_lock_init(&c->btree_write_error_lock);
-
-       INIT_LIST_HEAD(&c->journal_iters);
-
-       INIT_LIST_HEAD(&c->fsck_error_msgs);
-       mutex_init(&c->fsck_error_msgs_lock);
-
-       seqcount_init(&c->usage_lock);
-
-       sema_init(&c->io_in_flight, 128);
-
-       INIT_LIST_HEAD(&c->vfs_inodes_list);
-       mutex_init(&c->vfs_inodes_lock);
-
-       c->journal.flush_write_time     = &c->times[BCH_TIME_journal_flush_write];
-       c->journal.noflush_write_time   = &c->times[BCH_TIME_journal_noflush_write];
-       c->journal.flush_seq_time       = &c->times[BCH_TIME_journal_flush_seq];
-
-       mutex_init(&c->sectors_available_lock);
-
-       ret = percpu_init_rwsem(&c->mark_lock);
-       if (ret)
-               goto err;
-
-       mutex_lock(&c->sb_lock);
-       ret = bch2_sb_to_fs(c, sb);
-       mutex_unlock(&c->sb_lock);
-
-       if (ret)
-               goto err;
-
-       /*
-        * Compat: old superblocks predate the journal flush/reclaim delay
-        * options - fill in the defaults:
-        */
-       if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
-           !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
-               SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
-
-       if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
-           !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
-               SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
-
-       c->opts = bch2_opts_default;
-       ret = bch2_opts_from_sb(&c->opts, sb);
-       if (ret)
-               goto err;
-
-       bch2_opts_apply(&c->opts, *opts);
-
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-           c->opts.block_size > PAGE_SIZE) {
-               bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
-               ret = -EINVAL;
-               goto err;
-       }
-
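-       /*
-        * The alloc and logged_ops btrees always go through the key cache;
-        * the inodes btree only does when inodes_use_key_cache is set:
-        */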
-       c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
-       if (c->opts.inodes_use_key_cache)
-               c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
-       c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
-
-       c->block_bits           = ilog2(block_sectors(c));
-       c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
-
-       if (bch2_fs_init_fault("fs_alloc")) {
-               bch_err(c, "fs_alloc fault injected");
-               ret = -EFAULT;
-               goto err;
-       }
-
-       if (c->sb.multi_device)
-               pr_uuid(&name, c->sb.user_uuid.b);
-       else
-               prt_bdevname(&name, sbs->data[0].bdev);
-
-       ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
-       if (ret)
-               goto err;
-
-       strscpy(c->name, name.buf, sizeof(c->name));
-       printbuf_exit(&name);
-
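-       /*
-        * Scratch space for sorting btree node keys: one sort_iter, plus two
-        * sort_iter_sets per btree node block:
-        */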
-       iter_size = sizeof(struct sort_iter) +
-               (btree_blocks(c) + 1) * 2 *
-               sizeof(struct sort_iter_set);
-
-       if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
-                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 512)) ||
-           enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
-                               bch2_writes_disabled) ||
-           mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
-           bioset_init(&c->btree_bio, 1,
-                       max(offsetof(struct btree_read_bio, bio),
-                           offsetof(struct btree_write_bio, wbio.bio)),
-                       BIOSET_NEED_BVECS) ||
-           !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
-           !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
-           !(c->online_reserved = alloc_percpu(u64)) ||
-           mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
-                                      c->opts.btree_node_size) ||
-           mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
-               ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
-               goto err;
-       }
-
-       ret =
-           bch2_fs_async_obj_init(c) ?:
-           bch2_fs_btree_cache_init(c) ?:
-           bch2_fs_btree_iter_init(c) ?:
-           bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
-           bch2_fs_buckets_waiting_for_journal_init(c) ?:
-           bch2_io_clock_init(&c->io_clock[READ]) ?:
-           bch2_io_clock_init(&c->io_clock[WRITE]) ?:
-           bch2_fs_compress_init(c) ?:
-           bch2_fs_counters_init(c) ?:
-           bch2_fs_ec_init(c) ?:
-           bch2_fs_encryption_init(c) ?:
-           bch2_fs_fsio_init(c) ?:
-           bch2_fs_fs_io_direct_init(c) ?:
-           bch2_fs_io_read_init(c) ?:
-           bch2_fs_rebalance_init(c) ?:
-           bch2_fs_sb_errors_init(c) ?:
-           bch2_fs_vfs_init(c);
-       if (ret)
-               goto err;
-
-       if (go_rw_in_recovery(c)) {
-               /*
-                * start workqueues/kworkers early - kthread creation checks for
-                * pending signals, which is _very_ annoying
-                */
-               ret = bch2_fs_init_rw(c);
-               if (ret)
-                       goto err;
-       }
-
-#ifdef CONFIG_UNICODE
-       if (bch2_fs_casefold_enabled(c)) {
-               /* Default encoding until we can potentially have more as an option. */
-               c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
-               if (IS_ERR(c->cf_encoding)) {
-                       printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u\n",
-                              unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
-                              unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
-                              unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-#else
-       if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
-               printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
-               ret = -EINVAL;
-               goto err;
-       }
-#endif
-
-       for (i = 0; i < c->sb.nr_devices; i++) {
-               if (!bch2_member_exists(c->disk_sb.sb, i))
-                       continue;
-               ret = bch2_dev_alloc(c, i);
-               if (ret)
-                       goto err;
-       }
-
-       bch2_journal_entry_res_resize(&c->journal,
-                       &c->btree_root_journal_res,
-                       BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
-       bch2_journal_entry_res_resize(&c->journal,
-                       &c->clock_journal_res,
-                       (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
-
-       mutex_lock(&bch_fs_list_lock);
-       ret = bch2_fs_online(c);
-       mutex_unlock(&bch_fs_list_lock);
-
-       if (ret)
-               goto err;
-out:
-       return c;
-err:
-       bch2_fs_free(c);
-       c = ERR_PTR(ret);
-       goto out;
-}
-
-noinline_for_stack
-static void print_mount_opts(struct bch_fs *c)
-{
-       enum bch_opt_id i;
-       CLASS(printbuf, p)();
-       bch2_log_msg_start(c, &p);
-
-       prt_str(&p, "starting version ");
-       bch2_version_to_text(&p, c->sb.version);
-
-       bool first = true;
-       for (i = 0; i < bch2_opts_nr; i++) {
-               const struct bch_option *opt = &bch2_opt_table[i];
-               u64 v = bch2_opt_get_by_id(&c->opts, i);
-
-               if (!(opt->flags & OPT_MOUNT))
-                       continue;
-
-               if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
-                       continue;
-
-               prt_str(&p, first ? " opts=" : ",");
-               first = false;
-               bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
-       }
-
-       if (c->sb.version_incompat_allowed != c->sb.version) {
-               prt_printf(&p, "\nallowing incompatible features above ");
-               bch2_version_to_text(&p, c->sb.version_incompat_allowed);
-       }
-
-       if (c->opts.verbose) {
-               prt_printf(&p, "\nfeatures: ");
-               prt_bitflags(&p, bch2_sb_features, c->sb.features);
-       }
-
-       if (c->sb.multi_device) {
-               prt_printf(&p, "\nwith devices");
-               for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
-                       prt_char(&p, ' ');
-                       prt_str(&p, ca->name);
-               }
-       }
-
-       bch2_print_str(c, KERN_INFO, p.buf);
-}
-
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       unsigned flags = 0;
-
-       switch (c->opts.degraded) {
-       case BCH_DEGRADED_very:
-               flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
-               break;
-       case BCH_DEGRADED_yes:
-               flags |= BCH_FORCE_IF_DEGRADED;
-               break;
-       default:
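-               /*
-                * Not mounting degraded: refuse to start if any rw or ro
-                * member device is not online:
-                */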
-               mutex_lock(&c->sb_lock);
-               for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
-                       if (!bch2_member_exists(c->disk_sb.sb, i))
-                               continue;
-
-                       ca = bch2_dev_locked(c, i);
-
-                       if (!bch2_dev_is_online(ca) &&
-                           (ca->mi.state == BCH_MEMBER_STATE_rw ||
-                            ca->mi.state == BCH_MEMBER_STATE_ro)) {
-                               mutex_unlock(&c->sb_lock);
-                               return false;
-                       }
-               }
-               mutex_unlock(&c->sb_lock);
-               break;
-       }
-
-       return bch2_have_enough_devs(c, c->online_devs, flags, true);
-}
-
-int bch2_fs_start(struct bch_fs *c)
-{
-       time64_t now = ktime_get_real_seconds();
-       int ret = 0;
-
-       print_mount_opts(c);
-
-       if (c->cf_encoding)
-               bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
-                        unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
-                        unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
-                        unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
-
-       if (!bch2_fs_may_start(c))
-               return bch_err_throw(c, insufficient_devices_to_start);
-
-       down_write(&c->state_lock);
-       mutex_lock(&c->sb_lock);
-
-       BUG_ON(test_bit(BCH_FS_started, &c->flags));
-
-       if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
-                       sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
-               mutex_unlock(&c->sb_lock);
-               up_write(&c->state_lock);
-               ret = bch_err_throw(c, ENOSPC_sb);
-               goto err;
-       }
-
-       ret = bch2_sb_members_v2_init(c);
-       if (ret) {
-               mutex_unlock(&c->sb_lock);
-               up_write(&c->state_lock);
-               goto err;
-       }
-
-       scoped_guard(rcu)
-               for_each_online_member_rcu(c, ca)
-                       bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
-                               cpu_to_le64(now);
-
-       /*
-        * Don't write the superblock yet: recovery might have to downgrade
-        */
-       mutex_unlock(&c->sb_lock);
-
-       scoped_guard(rcu)
-               for_each_online_member_rcu(c, ca)
-                       if (ca->mi.state == BCH_MEMBER_STATE_rw)
-                               bch2_dev_allocator_add(c, ca);
-       bch2_recalc_capacity(c);
-       up_write(&c->state_lock);
-
-       c->recovery_task = current;
-       ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
-               ? bch2_fs_recovery(c)
-               : bch2_fs_initialize(c);
-       c->recovery_task = NULL;
-
-       if (ret)
-               goto err;
-
-       ret = bch2_opts_hooks_pre_set(c);
-       if (ret)
-               goto err;
-
-       if (bch2_fs_init_fault("fs_start")) {
-               ret = bch_err_throw(c, injected_fs_start);
-               goto err;
-       }
-
-       set_bit(BCH_FS_started, &c->flags);
-       wake_up(&c->ro_ref_wait);
-
-       down_write(&c->state_lock);
-       if (c->opts.read_only)
-               bch2_fs_read_only(c);
-       else if (!test_bit(BCH_FS_rw, &c->flags))
-               ret = bch2_fs_read_write(c);
-       up_write(&c->state_lock);
-
-err:
-       if (ret)
-               bch_err_msg(c, ret, "starting filesystem");
-       else
-               bch_verbose(c, "done starting filesystem");
-       return ret;
-}
-
-static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
-{
-       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-
-       if (le16_to_cpu(sb->block_size) != block_sectors(c))
-               return bch_err_throw(c, mismatched_block_size);
-
-       if (le16_to_cpu(m.bucket_size) <
-           BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
-               return bch_err_throw(c, bucket_size_too_small);
-
-       return 0;
-}
-
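-/*
- * Check that a device's superblock belongs to the filesystem we're
- * assembling, then look for split brain: the same seq but a different
- * write_time, or a member seq newer than the one the filesystem superblock
- * recorded for that device:
- */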
-static int bch2_dev_in_fs(struct bch_sb_handle *fs,
-                         struct bch_sb_handle *sb,
-                         struct bch_opts *opts)
-{
-       if (fs == sb)
-               return 0;
-
-       if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
-               return -BCH_ERR_device_not_a_member_of_filesystem;
-
-       if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
-               return -BCH_ERR_device_has_been_removed;
-
-       if (fs->sb->block_size != sb->sb->block_size)
-               return -BCH_ERR_mismatched_block_size;
-
-       if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
-           le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
-               return 0;
-
-       if (fs->sb->seq == sb->sb->seq &&
-           fs->sb->write_time != sb->sb->write_time) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "Split brain detected between ");
-               prt_bdevname(&buf, sb->bdev);
-               prt_str(&buf, " and ");
-               prt_bdevname(&buf, fs->bdev);
-               prt_char(&buf, ':');
-               prt_newline(&buf);
-               prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
-               prt_newline(&buf);
-
-               prt_bdevname(&buf, fs->bdev);
-               prt_char(&buf, ' ');
-               bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
-               prt_newline(&buf);
-
-               prt_bdevname(&buf, sb->bdev);
-               prt_char(&buf, ' ');
-               bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
-               prt_newline(&buf);
-
-               if (!opts->no_splitbrain_check)
-                       prt_printf(&buf, "Not using older sb");
-
-               pr_err("%s", buf.buf);
-               printbuf_exit(&buf);
-
-               if (!opts->no_splitbrain_check)
-                       return -BCH_ERR_device_splitbrain;
-       }
-
-       struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
-       u64 seq_from_fs         = le64_to_cpu(m.seq);
-       u64 seq_from_member     = le64_to_cpu(sb->sb->seq);
-
-       if (seq_from_fs && seq_from_fs < seq_from_member) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_str(&buf, "Split brain detected between ");
-               prt_bdevname(&buf, sb->bdev);
-               prt_str(&buf, " and ");
-               prt_bdevname(&buf, fs->bdev);
-               prt_char(&buf, ':');
-               prt_newline(&buf);
-
-               prt_bdevname(&buf, fs->bdev);
-               prt_str(&buf, " believes seq of ");
-               prt_bdevname(&buf, sb->bdev);
-               prt_printf(&buf, " to be %llu, but ", seq_from_fs);
-               prt_bdevname(&buf, sb->bdev);
-               prt_printf(&buf, " has %llu\n", seq_from_member);
-
-               if (!opts->no_splitbrain_check) {
-                       prt_str(&buf, "Not using ");
-                       prt_bdevname(&buf, sb->bdev);
-               }
-
-               pr_err("%s", buf.buf);
-               printbuf_exit(&buf);
-
-               if (!opts->no_splitbrain_check)
-                       return -BCH_ERR_device_splitbrain;
-       }
-
-       return 0;
-}
-
-/* Device startup/shutdown: */
-
-static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
-{
-       if (rw == READ)
-               clear_bit(ca->dev_idx, ca->fs->online_devs.d);
-
-       if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
-               enumerated_ref_stop(&ca->io_ref[rw],
-                                   rw == READ
-                                   ? bch2_dev_read_refs
-                                   : bch2_dev_write_refs);
-}
-
-static void bch2_dev_release(struct kobject *kobj)
-{
-       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-
-       kfree(ca);
-}
-
-static void bch2_dev_free(struct bch_dev *ca)
-{
-       WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
-       WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
-
-       cancel_work_sync(&ca->io_error_work);
-
-       bch2_dev_unlink(ca);
-
-       if (ca->kobj.state_in_sysfs)
-               kobject_del(&ca->kobj);
-
-       bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
-       bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
-
-       bch2_free_super(&ca->disk_sb);
-       bch2_dev_allocator_background_exit(ca);
-       bch2_dev_journal_exit(ca);
-
-       free_percpu(ca->io_done);
-       bch2_dev_buckets_free(ca);
-       kfree(ca->sb_read_scratch);
-
-       bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
-       bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
-
-       enumerated_ref_exit(&ca->io_ref[WRITE]);
-       enumerated_ref_exit(&ca->io_ref[READ]);
-#ifndef CONFIG_BCACHEFS_DEBUG
-       percpu_ref_exit(&ca->ref);
-#endif
-       kobject_put(&ca->kobj);
-}
-
-static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
-{
-       lockdep_assert_held(&c->state_lock);
-
-       if (enumerated_ref_is_zero(&ca->io_ref[READ]))
-               return;
-
-       __bch2_dev_read_only(c, ca);
-
-       bch2_dev_io_ref_stop(ca, READ);
-
-       bch2_dev_unlink(ca);
-
-       bch2_free_super(&ca->disk_sb);
-       bch2_dev_journal_exit(ca);
-}
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-static void bch2_dev_ref_complete(struct percpu_ref *ref)
-{
-       struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
-
-       complete(&ca->ref_completion);
-}
-#endif
-
-static void bch2_dev_unlink(struct bch_dev *ca)
-{
-       struct kobject *b;
-
-       /*
-        * This is racy w.r.t. the underlying block device being hot-removed,
-        * which removes it from sysfs.
-        *
-        * It'd be lovely if we had a way to handle this race, but the sysfs
-        * code doesn't appear to provide a good method and block/holder.c is
-        * susceptible as well:
-        */
-       if (ca->kobj.state_in_sysfs &&
-           ca->disk_sb.bdev &&
-           (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
-               sysfs_remove_link(b, "bcachefs");
-               sysfs_remove_link(&ca->kobj, "block");
-       }
-}
-
-static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
-{
-       int ret;
-
-       if (!c->kobj.state_in_sysfs)
-               return 0;
-
-       if (!ca->kobj.state_in_sysfs) {
-               ret =   kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
-                       bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
-               if (ret)
-                       return ret;
-       }
-
-       if (ca->disk_sb.bdev) {
-               struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
-
-               ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
-               if (ret)
-                       return ret;
-
-               ret = sysfs_create_link(&ca->kobj, block, "block");
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
-                                       struct bch_member *member)
-{
-       struct bch_dev *ca;
-       unsigned i;
-
-       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       if (!ca)
-               return NULL;
-
-       kobject_init(&ca->kobj, &bch2_dev_ktype);
-       init_completion(&ca->ref_completion);
-
-       INIT_WORK(&ca->io_error_work, bch2_io_error_work);
-
-       bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
-       bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
-
-       ca->mi = bch2_mi_to_cpu(member);
-
-       for (i = 0; i < ARRAY_SIZE(member->errors); i++)
-               atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
-
-       ca->uuid = member->uuid;
-
-       ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
-                            ca->mi.bucket_size / btree_sectors(c));
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-       if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
-               goto err;
-#else
-       atomic_long_set(&ca->ref, 1);
-#endif
-
-       mutex_init(&ca->bucket_backpointer_mismatch.lock);
-       mutex_init(&ca->bucket_backpointer_empty.lock);
-
-       bch2_dev_allocator_background_init(ca);
-
-       if (enumerated_ref_init(&ca->io_ref[READ],  BCH_DEV_READ_REF_NR,  NULL) ||
-           enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
-           !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
-           bch2_dev_buckets_alloc(c, ca) ||
-           !(ca->io_done       = alloc_percpu(*ca->io_done)))
-               goto err;
-
-       return ca;
-err:
-       bch2_dev_free(ca);
-       return NULL;
-}
-
-static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
-                           unsigned dev_idx)
-{
-       ca->dev_idx = dev_idx;
-       __set_bit(ca->dev_idx, ca->self.d);
-
-       if (!ca->name[0])
-               scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
-
-       ca->fs = c;
-       rcu_assign_pointer(c->devs[ca->dev_idx], ca);
-
-       if (bch2_dev_sysfs_online(c, ca))
-               pr_warn("error creating sysfs objects");
-}
-
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
-{
-       struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
-       struct bch_dev *ca = NULL;
-
-       if (bch2_fs_init_fault("dev_alloc"))
-               goto err;
-
-       ca = __bch2_dev_alloc(c, &member);
-       if (!ca)
-               goto err;
-
-       ca->fs = c;
-
-       bch2_dev_attach(c, ca, dev_idx);
-       return 0;
-err:
-       return bch_err_throw(c, ENOMEM_dev_alloc);
-}
-
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
-{
-       unsigned ret;
-
-       if (bch2_dev_is_online(ca)) {
-               bch_err(ca, "already have device online in slot %u",
-                       sb->sb->dev_idx);
-               return bch_err_throw(ca->fs, device_already_online);
-       }
-
-       if (get_capacity(sb->bdev->bd_disk) <
-           ca->mi.bucket_size * ca->mi.nbuckets) {
-               bch_err(ca, "cannot online: device too small");
-               return bch_err_throw(ca->fs, device_size_too_small);
-       }
-
-       BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
-       BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
-
-       ret = bch2_dev_journal_init(ca, sb->sb);
-       if (ret)
-               return ret;
-
-       struct printbuf name = PRINTBUF;
-       prt_bdevname(&name, sb->bdev);
-       strscpy(ca->name, name.buf, sizeof(ca->name));
-       printbuf_exit(&name);
-
-       /* Commit: take ownership of the superblock handle from the caller: */
-       ca->disk_sb = *sb;
-       memset(sb, 0, sizeof(*sb));
-
-       /*
-        * Stash pointer to the filesystem for blk_holder_ops - note that once
-        * attached to a filesystem, we will always close the block device
-        * before tearing down the filesystem object.
-        */
-       ca->disk_sb.holder->c = ca->fs;
-
-       ca->dev = ca->disk_sb.bdev->bd_dev;
-
-       enumerated_ref_start(&ca->io_ref[READ]);
-
-       return 0;
-}
-
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
-{
-       struct bch_dev *ca;
-       int ret;
-
-       lockdep_assert_held(&c->state_lock);
-
-       if (le64_to_cpu(sb->sb->seq) >
-           le64_to_cpu(c->disk_sb.sb->seq))
-               bch2_sb_to_fs(c, sb->sb);
-
-       BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
-
-       ca = bch2_dev_locked(c, sb->sb->dev_idx);
-
-       ret = __bch2_dev_attach_bdev(ca, sb);
-       if (ret)
-               return ret;
-
-       set_bit(ca->dev_idx, c->online_devs.d);
-
-       bch2_dev_sysfs_online(c, ca);
-
-       bch2_rebalance_wakeup(c);
-       return 0;
-}
-
-/* Device management: */
-
-/*
- * Note: this function is also used by the error paths - when a particular
- * device sees an error, we call it to determine whether we can just set the
- * device RO, or - if this function returns false - we'll set the whole
- * filesystem RO:
- *
- * XXX: maybe we should be more explicit about whether we're changing state
- * because we got an error or what have you?
- */
-bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
-                           enum bch_member_state new_state, int flags)
-{
-       struct bch_devs_mask new_online_devs;
-       int nr_rw = 0, required;
-
-       lockdep_assert_held(&c->state_lock);
-
-       switch (new_state) {
-       case BCH_MEMBER_STATE_rw:
-               return true;
-       case BCH_MEMBER_STATE_ro:
-               if (ca->mi.state != BCH_MEMBER_STATE_rw)
-                       return true;
-
-               /* do we have enough devices to write to?  */
-               for_each_member_device(c, ca2)
-                       if (ca2 != ca)
-                               nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
-
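-               /*
-                * The force flags relax the requirement from the configured
-                * replica counts down to the bare minimum needed:
-                */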
-               required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
-                              ? c->opts.metadata_replicas
-                              : metadata_replicas_required(c),
-                              !(flags & BCH_FORCE_IF_DATA_DEGRADED)
-                              ? c->opts.data_replicas
-                              : data_replicas_required(c));
-
-               return nr_rw >= required;
-       case BCH_MEMBER_STATE_failed:
-       case BCH_MEMBER_STATE_spare:
-               if (ca->mi.state != BCH_MEMBER_STATE_rw &&
-                   ca->mi.state != BCH_MEMBER_STATE_ro)
-                       return true;
-
-               /* do we have enough devices to read from?  */
-               new_online_devs = c->online_devs;
-               __clear_bit(ca->dev_idx, new_online_devs.d);
-
-               return bch2_have_enough_devs(c, new_online_devs, flags, false);
-       default:
-               BUG();
-       }
-}
-
-static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
-{
-       bch2_dev_io_ref_stop(ca, WRITE);
-
-       /*
-        * The allocator thread itself allocates btree nodes, so stop it first:
-        */
-       bch2_dev_allocator_remove(c, ca);
-       bch2_recalc_capacity(c);
-       bch2_dev_journal_stop(&c->journal, ca);
-}
-
-static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-{
-       lockdep_assert_held(&c->state_lock);
-
-       BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
-
-       bch2_dev_allocator_add(c, ca);
-       bch2_recalc_capacity(c);
-
-       if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
-               enumerated_ref_start(&ca->io_ref[WRITE]);
-
-       bch2_dev_do_discards(ca);
-}
-
-int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
-                        enum bch_member_state new_state, int flags)
-{
-       struct bch_member *m;
-       int ret = 0;
-
-       if (ca->mi.state == new_state)
-               return 0;
-
-       if (!bch2_dev_state_allowed(c, ca, new_state, flags))
-               return bch_err_throw(c, device_state_not_allowed);
-
-       if (new_state != BCH_MEMBER_STATE_rw)
-               __bch2_dev_read_only(c, ca);
-
-       bch_notice(ca, "%s", bch2_member_states[new_state]);
-
-       mutex_lock(&c->sb_lock);
-       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-       SET_BCH_MEMBER_STATE(m, new_state);
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       if (new_state == BCH_MEMBER_STATE_rw)
-               __bch2_dev_read_write(c, ca);
-
-       bch2_rebalance_wakeup(c);
-
-       return ret;
-}
-
-int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
-                      enum bch_member_state new_state, int flags)
-{
-       int ret;
-
-       down_write(&c->state_lock);
-       ret = __bch2_dev_set_state(c, ca, new_state, flags);
-       up_write(&c->state_lock);
-
-       return ret;
-}
-
-/* Device add/removal: */
-
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
-       struct bch_member *m;
-       unsigned dev_idx = ca->dev_idx, data;
-       bool fast_device_removal = !bch2_request_incompat_feature(c,
-                                       bcachefs_metadata_version_fast_device_removal);
-       int ret;
-
-       down_write(&c->state_lock);
-
-       /*
-        * We consume a reference to ca->ref, regardless of whether we succeed
-        * or fail:
-        */
-       bch2_dev_put(ca);
-
-       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
-               bch_err(ca, "Cannot remove without losing data");
-               ret = bch_err_throw(c, device_state_not_allowed);
-               goto err;
-       }
-
-       __bch2_dev_read_only(c, ca);
-
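-       /*
-        * Fast removal finds this device's data via its backpointers, instead
-        * of scanning the whole filesystem:
-        */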
-       ret = fast_device_removal
-               ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
-               : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
-                  bch2_dev_remove_stripes(c, ca->dev_idx, flags));
-       if (ret)
-               goto err;
-
-       /* Check if device still has data before blowing away alloc info */
-       struct bch_dev_usage usage = bch2_dev_usage_read(ca);
-       for (unsigned i = 0; i < BCH_DATA_NR; i++)
-               if (!data_type_is_empty(i) &&
-                   !data_type_is_hidden(i) &&
-                   usage.buckets[i]) {
-                       bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
-                               __bch2_data_types[i], usage.buckets[i]);
-                       ret = -EBUSY;
-                       goto err;
-               }
-
-       ret = bch2_dev_remove_alloc(c, ca);
-       bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
-       if (ret)
-               goto err;
-
-       /*
-        * We need to flush the entire journal to get rid of keys that reference
-        * the device being removed before removing the superblock entry
-        */
-       bch2_journal_flush_all_pins(&c->journal);
-
-       /*
-        * this is really just needed for the bch2_replicas_gc_(start|end)
-        * calls, and could be cleaned up:
-        */
-       ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
-       bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
-       if (ret)
-               goto err;
-
-       ret = bch2_journal_flush(&c->journal);
-       bch_err_msg(ca, ret, "bch2_journal_flush()");
-       if (ret)
-               goto err;
-
-       ret = bch2_replicas_gc2(c);
-       bch_err_msg(ca, ret, "bch2_replicas_gc2()");
-       if (ret)
-               goto err;
-
-       data = bch2_dev_has_data(c, ca);
-       if (data) {
-               struct printbuf data_has = PRINTBUF;
-
-               prt_bitflags(&data_has, __bch2_data_types, data);
-               bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
-               printbuf_exit(&data_has);
-               ret = -EBUSY;
-               goto err;
-       }
-
-       __bch2_dev_offline(c, ca);
-
-       mutex_lock(&c->sb_lock);
-       rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-       mutex_unlock(&c->sb_lock);
-
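-       /* drop the final reference and wait for any remaining users to finish: */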
-#ifndef CONFIG_BCACHEFS_DEBUG
-       percpu_ref_kill(&ca->ref);
-#else
-       ca->dying = true;
-       bch2_dev_put(ca);
-#endif
-       wait_for_completion(&ca->ref_completion);
-
-       bch2_dev_free(ca);
-
-       /*
-        * Free this device's slot in the bch_member array - all pointers to
-        * this device must be gone:
-        */
-       mutex_lock(&c->sb_lock);
-       m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
-
-       if (fast_device_removal)
-               m->uuid = BCH_SB_MEMBER_DELETED_UUID;
-       else
-               memset(&m->uuid, 0, sizeof(m->uuid));
-
-       bch2_write_super(c);
-
-       mutex_unlock(&c->sb_lock);
-       up_write(&c->state_lock);
-       return 0;
-err:
-       if (test_bit(BCH_FS_rw, &c->flags) &&
-           ca->mi.state == BCH_MEMBER_STATE_rw &&
-           !enumerated_ref_is_zero(&ca->io_ref[READ]))
-               __bch2_dev_read_write(c, ca);
-       up_write(&c->state_lock);
-       return ret;
-}
-
-/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
-{
-       struct bch_opts opts = bch2_opts_empty();
-       struct bch_sb_handle sb = {};
-       struct bch_dev *ca = NULL;
-       struct printbuf errbuf = PRINTBUF;
-       struct printbuf label = PRINTBUF;
-       int ret = 0;
-
-       ret = bch2_read_super(path, &opts, &sb);
-       bch_err_msg(c, ret, "reading super");
-       if (ret)
-               goto err;
-
-       struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
-
-       if (BCH_MEMBER_GROUP(&dev_mi)) {
-               bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
-               if (label.allocation_failure) {
-                       ret = -ENOMEM;
-                       goto err;
-               }
-       }
-
-       if (list_empty(&c->list)) {
-               mutex_lock(&bch_fs_list_lock);
-               if (__bch2_uuid_to_fs(c->sb.uuid))
-                       ret = bch_err_throw(c, filesystem_uuid_already_open);
-               else
-                       list_add(&c->list, &bch_fs_list);
-               mutex_unlock(&bch_fs_list_lock);
-
-               if (ret) {
-                       bch_err(c, "filesystem UUID already open");
-                       goto err;
-               }
-       }
-
-       ret = bch2_dev_may_add(sb.sb, c);
-       if (ret)
-               goto err;
-
-       ca = __bch2_dev_alloc(c, &dev_mi);
-       if (!ca) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       ret = __bch2_dev_attach_bdev(ca, &sb);
-       if (ret)
-               goto err;
-
-       down_write(&c->state_lock);
-       mutex_lock(&c->sb_lock);
-       SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
-
-       ret = bch2_sb_from_fs(c, ca);
-       bch_err_msg(c, ret, "setting up new superblock");
-       if (ret)
-               goto err_unlock;
-
-       if (dynamic_fault("bcachefs:add:no_slot")) {
-               ret = bch_err_throw(c, ENOSPC_sb_members);
-               goto err_unlock;
-       }
-
-       ret = bch2_sb_member_alloc(c);
-       if (ret < 0) {
-               bch_err_msg(c, ret, "setting up new superblock");
-               goto err_unlock;
-       }
-       unsigned dev_idx = ret;
-       ret = 0;
-
-       /* success: */
-
-       dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
-       *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
-
-       ca->disk_sb.sb->dev_idx = dev_idx;
-       bch2_dev_attach(c, ca, dev_idx);
-
-       if (BCH_MEMBER_GROUP(&dev_mi)) {
-               ret = __bch2_dev_group_set(c, ca, label.buf);
-               bch_err_msg(c, ret, "creating new label");
-               if (ret)
-                       goto err_unlock;
-       }
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       if (test_bit(BCH_FS_started, &c->flags)) {
-               ret = bch2_dev_usage_init(ca, false);
-               if (ret)
-                       goto err_late;
-
-               ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
-               bch_err_msg(ca, ret, "marking new superblock");
-               if (ret)
-                       goto err_late;
-
-               ret = bch2_fs_freespace_init(c);
-               bch_err_msg(ca, ret, "initializing free space");
-               if (ret)
-                       goto err_late;
-
-               if (ca->mi.state == BCH_MEMBER_STATE_rw)
-                       __bch2_dev_read_write(c, ca);
-
-               ret = bch2_dev_journal_alloc(ca, false);
-               bch_err_msg(c, ret, "allocating journal");
-               if (ret)
-                       goto err_late;
-       }
-
-       /*
-        * We just changed the superblock UUID, invalidate cache and send a
-        * uevent to update /dev/disk/by-uuid
-        */
-       invalidate_bdev(ca->disk_sb.bdev);
-
-       char uuid_str[sizeof("UUID=") + 36]; /* "UUID=" + 36 char UUID + NUL */
-       snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);
-
-       char *envp[] = {
-               "CHANGE=uuid",
-               uuid_str,
-               NULL,
-       };
-       kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
-
-       up_write(&c->state_lock);
-out:
-       printbuf_exit(&label);
-       printbuf_exit(&errbuf);
-       bch_err_fn(c, ret);
-       return ret;
-
-err_unlock:
-       mutex_unlock(&c->sb_lock);
-       up_write(&c->state_lock);
-err:
-       if (ca)
-               bch2_dev_free(ca);
-       bch2_free_super(&sb);
-       goto out;
-err_late:
-       up_write(&c->state_lock);
-       ca = NULL;
-       goto err;
-}
-
-/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
-{
-       struct bch_opts opts = bch2_opts_empty();
-       struct bch_sb_handle sb = { NULL };
-       struct bch_dev *ca;
-       unsigned dev_idx;
-       int ret;
-
-       down_write(&c->state_lock);
-
-       ret = bch2_read_super(path, &opts, &sb);
-       if (ret) {
-               up_write(&c->state_lock);
-               return ret;
-       }
-
-       dev_idx = sb.sb->dev_idx;
-
-       ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
-       bch_err_msg(c, ret, "bringing %s online", path);
-       if (ret)
-               goto err;
-
-       ret = bch2_dev_attach_bdev(c, &sb);
-       if (ret)
-               goto err;
-
-       ca = bch2_dev_locked(c, dev_idx);
-
-       ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
-       bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
-       if (ret)
-               goto err;
-
-       if (ca->mi.state == BCH_MEMBER_STATE_rw)
-               __bch2_dev_read_write(c, ca);
-
-       if (!ca->mi.freespace_initialized) {
-               ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
-               bch_err_msg(ca, ret, "initializing free space");
-               if (ret)
-                       goto err;
-       }
-
-       if (!ca->journal.nr) {
-               ret = bch2_dev_journal_alloc(ca, false);
-               bch_err_msg(ca, ret, "allocating journal");
-               if (ret)
-                       goto err;
-       }
-
-       mutex_lock(&c->sb_lock);
-       bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
-               cpu_to_le64(ktime_get_real_seconds());
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       up_write(&c->state_lock);
-       return 0;
-err:
-       up_write(&c->state_lock);
-       bch2_free_super(&sb);
-       return ret;
-}
-
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
-       down_write(&c->state_lock);
-
-       if (!bch2_dev_is_online(ca)) {
-               bch_err(ca, "Already offline");
-               up_write(&c->state_lock);
-               return 0;
-       }
-
-       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
-               bch_err(ca, "Cannot offline required disk");
-               up_write(&c->state_lock);
-               return bch_err_throw(c, device_state_not_allowed);
-       }
-
-       __bch2_dev_offline(c, ca);
-
-       up_write(&c->state_lock);
-       return 0;
-}
-
-static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
-{
-       struct bch_fs *c = ca->fs;
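-       /* v[0] is the bucket count delta; the new buckets all start out free: */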
-       u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
-
-       return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
-                       bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
-                                                 .dev = ca->dev_idx,
-                                                 .data_type = BCH_DATA_free)) ?:
-               bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
-}
-
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
-       struct bch_member *m;
-       u64 old_nbuckets;
-       int ret = 0;
-
-       down_write(&c->state_lock);
-       old_nbuckets = ca->mi.nbuckets;
-
-       if (nbuckets < ca->mi.nbuckets) {
-               bch_err(ca, "Cannot shrink yet");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
-               bch_err(ca, "New device size too big (%llu greater than max %u)",
-                       nbuckets, BCH_MEMBER_NBUCKETS_MAX);
-               ret = bch_err_throw(c, device_size_too_big);
-               goto err;
-       }
-
-       if (bch2_dev_is_online(ca) &&
-           get_capacity(ca->disk_sb.bdev->bd_disk) <
-           ca->mi.bucket_size * nbuckets) {
-               bch_err(ca, "New size larger than device");
-               ret = bch_err_throw(c, device_size_too_small);
-               goto err;
-       }
-
-       ret = bch2_dev_buckets_resize(c, ca, nbuckets);
-       bch_err_msg(ca, ret, "resizing buckets");
-       if (ret)
-               goto err;
-
-       ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
-       if (ret)
-               goto err;
-
-       mutex_lock(&c->sb_lock);
-       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-       m->nbuckets = cpu_to_le64(nbuckets);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       if (ca->mi.freespace_initialized) {
-               ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
-               if (ret)
-                       goto err;
-       }
-
-       bch2_recalc_capacity(c);
-err:
-       up_write(&c->state_lock);
-       return ret;
-}
-
-int bch2_fs_resize_on_mount(struct bch_fs *c)
-{
-       for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
-               u64 old_nbuckets = ca->mi.nbuckets;
-               u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
-                                        ca->mi.bucket_size);
-
-               if (ca->mi.resize_on_mount &&
-                   new_nbuckets > ca->mi.nbuckets) {
-                       bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
-                       int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
-                       bch_err_fn(ca, ret);
-                       if (ret) {
-                               enumerated_ref_put(&ca->io_ref[READ],
-                                                  BCH_DEV_READ_REF_fs_resize_on_mount);
-                               up_write(&c->state_lock);
-                               return ret;
-                       }
-
-                       mutex_lock(&c->sb_lock);
-                       struct bch_member *m =
-                               bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-                       m->nbuckets = cpu_to_le64(new_nbuckets);
-                       SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
-
-                       c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
-                       bch2_write_super(c);
-                       mutex_unlock(&c->sb_lock);
-
-                       if (ca->mi.freespace_initialized) {
-                               ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
-                               if (ret) {
-                                       enumerated_ref_put(&ca->io_ref[READ],
-                                                       BCH_DEV_READ_REF_fs_resize_on_mount);
-                                       up_write(&c->state_lock);
-                                       return ret;
-                               }
-                       }
-               }
-       }
-       return 0;
-}
-
-/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
-{
-       if (!strncmp(name, "/dev/", strlen("/dev/")))
-               name += strlen("/dev/");
-
-       for_each_member_device(c, ca)
-               if (!strcmp(name, ca->name))
-                       return ca;
-       return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-}
-
-/* blk_holder_ops: */
-
-static struct bch_fs *bdev_get_fs(struct block_device *bdev)
-       __releases(&bdev->bd_holder_lock)
-{
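-       /* called with bd_holder_lock held; we drop it after taking a ro ref: */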
-       struct bch_sb_handle_holder *holder = bdev->bd_holder;
-       struct bch_fs *c = holder->c;
-
-       if (c && !bch2_ro_ref_tryget(c))
-               c = NULL;
-
-       mutex_unlock(&bdev->bd_holder_lock);
-
-       if (c)
-               wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
-       return c;
-}
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
-{
-       for_each_member_device(c, ca)
-               if (ca->disk_sb.bdev == bdev)
-                       return ca;
-       return NULL;
-}
-
-static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
-{
-       struct bch_fs *c = bdev_get_fs(bdev);
-       if (!c)
-               return;
-
-       struct super_block *sb = c->vfs_sb;
-       if (sb) {
-               /*
-                * Not necessary, c->ro_ref guards against the filesystem being
-                * unmounted - we only take this to avoid a warning in
-                * sync_filesystem:
-                */
-               down_read(&sb->s_umount);
-       }
-
-       down_write(&c->state_lock);
-       struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
-       if (!ca)
-               goto unlock;
-
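-       /* can we just offline this device, or must the whole filesystem go read-only? */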
-       bool dev = bch2_dev_state_allowed(c, ca,
-                                         BCH_MEMBER_STATE_failed,
-                                         BCH_FORCE_IF_DEGRADED);
-
-       if (!dev && sb) {
-               if (!surprise)
-                       sync_filesystem(sb);
-               shrink_dcache_sb(sb);
-               evict_inodes(sb);
-       }
-
-       struct printbuf buf = PRINTBUF;
-       __bch2_log_msg_start(ca->name, &buf);
-
-       prt_printf(&buf, "offline from block layer");
-
-       if (dev) {
-               __bch2_dev_offline(c, ca);
-       } else {
-               bch2_journal_flush(&c->journal);
-               bch2_fs_emergency_read_only2(c, &buf);
-       }
-
-       bch2_print_str(c, KERN_ERR, buf.buf);
-       printbuf_exit(&buf);
-
-       bch2_dev_put(ca);
-unlock:
-       if (sb)
-               up_read(&sb->s_umount);
-       up_write(&c->state_lock);
-       bch2_ro_ref_put(c);
-}
-
-static void bch2_fs_bdev_sync(struct block_device *bdev)
-{
-       struct bch_fs *c = bdev_get_fs(bdev);
-       if (!c)
-               return;
-
-       struct super_block *sb = c->vfs_sb;
-       if (sb) {
-               /*
-                * Not necessary, c->ro_ref guards against the filesystem being
-                * unmounted - we only take this to avoid a warning in
-                * sync_filesystem:
-                */
-               down_read(&sb->s_umount);
-               sync_filesystem(sb);
-               up_read(&sb->s_umount);
-       }
-
-       bch2_ro_ref_put(c);
-}
-
-const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-       .mark_dead              = bch2_fs_bdev_mark_dead,
-       .sync                   = bch2_fs_bdev_sync,
-};
-
-/* Filesystem open: */
-
-static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
-{
-       return  cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
-               cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
-}
-
-struct bch_fs *bch2_fs_open(darray_const_str *devices,
-                           struct bch_opts *opts)
-{
-       bch_sb_handles sbs = {};
-       struct bch_fs *c = NULL;
-       struct bch_sb_handle *best = NULL;
-       struct printbuf errbuf = PRINTBUF;
-       int ret = 0;
-
-       if (!try_module_get(THIS_MODULE))
-               return ERR_PTR(-ENODEV);
-
-       if (!devices->nr) {
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = darray_make_room(&sbs, devices->nr);
-       if (ret)
-               goto err;
-
-       darray_for_each(*devices, i) {
-               struct bch_sb_handle sb = { NULL };
-
-               ret = bch2_read_super(*i, opts, &sb);
-               if (ret)
-                       goto err;
-
-               BUG_ON(darray_push(&sbs, sb));
-       }
-
-       if (opts->nochanges && !opts->read_only) {
-               ret = bch_err_throw(c, erofs_nochanges);
-               goto err_print;
-       }
-
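-       /* the superblock with the highest seq (then write_time) is the freshest copy: */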
-       darray_for_each(sbs, sb)
-               if (!best || sb_cmp(sb->sb, best->sb) > 0)
-                       best = sb;
-
-       darray_for_each_reverse(sbs, sb) {
-               ret = bch2_dev_in_fs(best, sb, opts);
-
-               if (ret == -BCH_ERR_device_has_been_removed ||
-                   ret == -BCH_ERR_device_splitbrain) {
-                       bch2_free_super(sb);
-                       darray_remove_item(&sbs, sb);
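-                       /* if best pointed past the removed element, it just shifted down: */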
-                       best -= best > sb;
-                       ret = 0;
-                       continue;
-               }
-
-               if (ret)
-                       goto err_print;
-       }
-
-       c = bch2_fs_alloc(best->sb, opts, &sbs);
-       ret = PTR_ERR_OR_ZERO(c);
-       if (ret)
-               goto err;
-
-       down_write(&c->state_lock);
-       darray_for_each(sbs, sb) {
-               ret = bch2_dev_attach_bdev(c, sb);
-               if (ret) {
-                       up_write(&c->state_lock);
-                       goto err;
-               }
-       }
-       up_write(&c->state_lock);
-
-       if (!c->opts.nostart) {
-               ret = bch2_fs_start(c);
-               if (ret)
-                       goto err;
-       }
-out:
-       darray_for_each(sbs, sb)
-               bch2_free_super(sb);
-       darray_exit(&sbs);
-       printbuf_exit(&errbuf);
-       module_put(THIS_MODULE);
-       return c;
-err_print:
-       pr_err("bch_fs_open err opening %s: %s",
-              devices->data[0], bch2_err_str(ret));
-err:
-       if (!IS_ERR_OR_NULL(c))
-               bch2_fs_stop(c);
-       c = ERR_PTR(ret);
-       goto out;
-}
-
-/* Global interfaces/init */
-
-static void bcachefs_exit(void)
-{
-       bch2_debug_exit();
-       bch2_vfs_exit();
-       bch2_chardev_exit();
-       bch2_btree_key_cache_exit();
-       if (bcachefs_kset)
-               kset_unregister(bcachefs_kset);
-}
-
-static int __init bcachefs_init(void)
-{
-       bch2_bkey_pack_test();
-
-       if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
-           bch2_btree_key_cache_init() ||
-           bch2_chardev_init() ||
-           bch2_vfs_init() ||
-           bch2_debug_init())
-               goto err;
-
-       return 0;
-err:
-       bcachefs_exit();
-       return -ENOMEM;
-}
-
-#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
-{
-       /* Parse exactly like a bool param, by reusing param_set_bool(): */
-       struct static_key *key = kp->arg;
-       struct kernel_param boolkp = *kp;
-       bool v;
-       int ret;
-
-       boolkp.arg = &v;
-
-       ret = param_set_bool(val, &boolkp);
-       if (ret)
-               return ret;
-       if (v)
-               static_key_enable(key);
-       else
-               static_key_disable(key);
-       return 0;
-}
-
-static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
-{
-       struct static_key *key = kp->arg;
-       return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
-}
-
-static const struct kernel_param_ops bch2_param_ops_static_key_t = {
-       .flags = KERNEL_PARAM_OPS_FL_NOARG,
-       .set = bch2_param_set_static_key_t,
-       .get = bch2_param_get_static_key_t,
-};
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
-       __MODULE_PARM_TYPE(name, "static_key_t");                       \
-       MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-__maybe_unused
-static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0444);
-
-module_exit(bcachefs_exit);
-module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
deleted file mode 100644 (file)
index e90bab9..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_H
-#define _BCACHEFS_SUPER_H
-
-#include "extents.h"
-
-#include "bcachefs_ioctl.h"
-
-#include <linux/math64.h>
-
-extern const char * const bch2_fs_flag_strs[];
-extern const char * const bch2_write_refs[];
-extern const char * const bch2_dev_read_refs[];
-extern const char * const bch2_dev_write_refs[];
-
-struct bch_fs *bch2_dev_to_fs(dev_t);
-struct bch_fs *bch2_uuid_to_fs(__uuid_t);
-
-bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
-                          enum bch_member_state, int);
-int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
-                       enum bch_member_state, int);
-int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
-                     enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
-struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-
-bool bch2_fs_emergency_read_only(struct bch_fs *);
-bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
-void bch2_fs_read_only(struct bch_fs *);
-
-int bch2_fs_read_write(struct bch_fs *);
-int bch2_fs_read_write_early(struct bch_fs *);
-
-int bch2_fs_resize_on_mount(struct bch_fs *);
-
-void __bch2_fs_stop(struct bch_fs *);
-void bch2_fs_free(struct bch_fs *);
-void bch2_fs_stop(struct bch_fs *);
-
-int bch2_fs_init_rw(struct bch_fs *);
-int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
-
-extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
-
-#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
deleted file mode 100644 (file)
index 3a899f7..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_TYPES_H
-#define _BCACHEFS_SUPER_TYPES_H
-
-struct bch_fs;
-
-struct bch_sb_handle_holder {
-       struct bch_fs           *c;
-};
-
-struct bch_sb_handle {
-       struct bch_sb           *sb;
-       struct file             *s_bdev_file;
-       struct block_device     *bdev;
-       char                    *sb_name;
-       struct bio              *bio;
-       struct bch_sb_handle_holder *holder;
-       size_t                  buffer_size;
-       blk_mode_t              mode;
-       unsigned                have_layout:1;
-       unsigned                have_bio:1;
-       unsigned                fs_sb:1;
-       u64                     seq;
-};
-
-struct bch_devs_mask {
-       unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
-};
-
-struct bch_devs_list {
-       u8                      nr;
-       u8                      data[BCH_BKEY_PTRS_MAX];
-};
-
-#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
deleted file mode 100644 (file)
index 0584837..0000000
+++ /dev/null
@@ -1,914 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcache sysfs interfaces
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#ifndef NO_BCACHEFS_SYSFS
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "sysfs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "inode.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-errors.h"
-#include "super-io.h"
-#include "tests.h"
-
-#include <linux/blkdev.h>
-#include <linux/sort.h>
-#include <linux/sched/clock.h>
-
-#include "util.h"
-
-#define SYSFS_OPS(type)                                                        \
-const struct sysfs_ops type ## _sysfs_ops = {                          \
-       .show   = type ## _show,                                        \
-       .store  = type ## _store                                        \
-}
-
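-/*
- * SHOW()/STORE() wrap sysfs show/store methods: the _to_text() body renders
- * into a printbuf, and the generated wrapper copies out at most one page and
- * maps internal error codes to standard errnos:
- */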
-#define SHOW(fn)                                                       \
-static ssize_t fn ## _to_text(struct printbuf *,                       \
-                             struct kobject *, struct attribute *);    \
-                                                                       \
-static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
-                          char *buf)                                   \
-{                                                                      \
-       struct printbuf out = PRINTBUF;                                 \
-       ssize_t ret = fn ## _to_text(&out, kobj, attr);                 \
-                                                                       \
-       if (out.pos && out.buf[out.pos - 1] != '\n')                    \
-               prt_newline(&out);                                      \
-                                                                       \
-       if (!ret && out.allocation_failure)                             \
-               ret = -ENOMEM;                                          \
-                                                                       \
-       if (!ret) {                                                     \
-               ret = min_t(size_t, out.pos, PAGE_SIZE - 1);            \
-               memcpy(buf, out.buf, ret);                              \
-       }                                                               \
-       printbuf_exit(&out);                                            \
-       return bch2_err_class(ret);                                     \
-}                                                                      \
-                                                                       \
-static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
-                             struct attribute *attr)
-
-#define STORE(fn)                                                      \
-static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
-                           const char *, size_t);                      \
-                                                                       \
-static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
-                           const char *buf, size_t size)               \
-{                                                                      \
-       return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
-}                                                                      \
-                                                                       \
-static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
-                                 const char *buf, size_t size)
-
-#define __sysfs_attribute(_name, _mode)                                        \
-       static struct attribute sysfs_##_name =                         \
-               { .name = #_name, .mode = _mode }
-
-#define write_attribute(n)     __sysfs_attribute(n, 0200)
-#define read_attribute(n)      __sysfs_attribute(n, 0444)
-#define rw_attribute(n)                __sysfs_attribute(n, 0644)
-
-#define sysfs_printf(file, fmt, ...)                                   \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               prt_printf(out, fmt "\n", __VA_ARGS__);                 \
-} while (0)
-
-#define sysfs_print(file, var)                                         \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               snprint(out, var);                                      \
-} while (0)
-
-#define sysfs_hprint(file, val)                                                \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               prt_human_readable_s64(out, val);                       \
-} while (0)
-
-#define sysfs_strtoul(file, var)                                       \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               return strtoul_safe(buf, var) ?: (ssize_t) size;        \
-} while (0)
-
-#define sysfs_strtoul_clamp(file, var, min, max)                       \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               return strtoul_safe_clamp(buf, var, min, max)           \
-                       ?: (ssize_t) size;                              \
-} while (0)
-
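-/* note: on a parse error this returns from the enclosing store method */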
-#define strtoul_or_return(cp)                                          \
-({                                                                     \
-       unsigned long _v;                                               \
-       int _r = kstrtoul(cp, 10, &_v);                                 \
-       if (_r)                                                         \
-               return _r;                                              \
-       _v;                                                             \
-})
-
-write_attribute(trigger_gc);
-write_attribute(trigger_discards);
-write_attribute(trigger_invalidates);
-write_attribute(trigger_journal_commit);
-write_attribute(trigger_journal_flush);
-write_attribute(trigger_journal_writes);
-write_attribute(trigger_btree_cache_shrink);
-write_attribute(trigger_btree_key_cache_shrink);
-write_attribute(trigger_btree_updates);
-write_attribute(trigger_freelist_wakeup);
-write_attribute(trigger_recalc_capacity);
-write_attribute(trigger_delete_dead_snapshots);
-write_attribute(trigger_emergency_read_only);
-read_attribute(gc_gens_pos);
-
-read_attribute(uuid);
-read_attribute(minor);
-read_attribute(flags);
-read_attribute(first_bucket);
-read_attribute(nbuckets);
-read_attribute(io_done);
-read_attribute(io_errors);
-write_attribute(io_errors_reset);
-
-read_attribute(io_latency_read);
-read_attribute(io_latency_write);
-read_attribute(io_latency_stats_read);
-read_attribute(io_latency_stats_write);
-read_attribute(congested);
-
-read_attribute(btree_write_stats);
-
-read_attribute(btree_cache_size);
-read_attribute(compression_stats);
-read_attribute(errors);
-read_attribute(journal_debug);
-read_attribute(btree_cache);
-read_attribute(btree_key_cache);
-read_attribute(btree_reserve_cache);
-read_attribute(open_buckets);
-read_attribute(open_buckets_partial);
-read_attribute(nocow_lock_table);
-
-read_attribute(read_refs);
-read_attribute(write_refs);
-
-read_attribute(internal_uuid);
-read_attribute(disk_groups);
-
-read_attribute(has_data);
-read_attribute(alloc_debug);
-read_attribute(usage_base);
-
-#define x(t, n, ...) read_attribute(t);
-BCH_PERSISTENT_COUNTERS()
-#undef x
-
-rw_attribute(label);
-
-read_attribute(copy_gc_wait);
-
-sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_status);
-read_attribute(snapshot_delete_status);
-read_attribute(recovery_status);
-
-read_attribute(new_stripes);
-
-read_attribute(io_timers_read);
-read_attribute(io_timers_write);
-
-read_attribute(moving_ctxts);
-
-#ifdef CONFIG_BCACHEFS_TESTS
-write_attribute(perf_test);
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#define x(_name)                                               \
-       static struct attribute sysfs_time_stat_##_name =               \
-               { .name = #_name, .mode = 0644 };
-       BCH_TIME_STATS()
-#undef x
-
-static size_t bch2_btree_cache_size(struct bch_fs *c)
-{
-       struct btree_cache *bc = &c->btree_cache;
-       size_t ret = 0;
-       struct btree *b;
-
-       mutex_lock(&bc->lock);
-       list_for_each_entry(b, &bc->live[0].list, list)
-               ret += btree_buf_bytes(b);
-       list_for_each_entry(b, &bc->live[1].list, list)
-               ret += btree_buf_bytes(b);
-       list_for_each_entry(b, &bc->freeable, list)
-               ret += btree_buf_bytes(b);
-       mutex_unlock(&bc->lock);
-       return ret;
-}
-
-static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       prt_str(out, "type");
-       printbuf_tabstop_push(out, 12);
-       printbuf_tabstop_push(out, 16);
-       printbuf_tabstop_push(out, 16);
-       printbuf_tabstop_push(out, 24);
-       prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
-
-       for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
-               struct disk_accounting_pos a;
-               disk_accounting_key_init(a, compression, .type = i);
-               struct bpos p = disk_accounting_pos_to_bpos(&a);
-               u64 v[3];
-               bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
-
-               u64 nr_extents                  = v[0];
-               u64 sectors_uncompressed        = v[1];
-               u64 sectors_compressed          = v[2];
-
-               bch2_prt_compression_type(out, i);
-               prt_tab(out);
-
-               prt_human_readable_u64(out, sectors_compressed << 9);
-               prt_tab_rjust(out);
-
-               prt_human_readable_u64(out, sectors_uncompressed << 9);
-               prt_tab_rjust(out);
-
-               prt_human_readable_u64(out, nr_extents
-                                      ? div64_u64(sectors_uncompressed << 9, nr_extents)
-                                      : 0);
-               prt_tab_rjust(out);
-               prt_newline(out);
-       }
-
-       return 0;
-}
-
-static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       bch2_btree_id_to_text(out, c->gc_gens_btree);
-       prt_printf(out, ": ");
-       bch2_bpos_to_text(out, c->gc_gens_pos);
-       prt_printf(out, "\n");
-}
-
-static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct bch_fs_usage_base b = {};
-
-       acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
-
-       prt_printf(out, "hidden:\t\t%llu\n",    b.hidden);
-       prt_printf(out, "btree:\t\t%llu\n",     b.btree);
-       prt_printf(out, "data:\t\t%llu\n",      b.data);
-       prt_printf(out, "cached:\t%llu\n",      b.cached);
-       prt_printf(out, "reserved:\t\t%llu\n",  b.reserved);
-       prt_printf(out, "nr_inodes:\t%llu\n",   b.nr_inodes);
-}
-
-SHOW(bch2_fs)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
-       sysfs_print(minor,                      c->minor);
-       sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
-
-       if (attr == &sysfs_flags)
-               prt_bitflags(out, bch2_fs_flag_strs, c->flags);
-
-       sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
-
-       if (attr == &sysfs_btree_write_stats)
-               bch2_btree_write_stats_to_text(out, c);
-
-       if (attr == &sysfs_gc_gens_pos)
-               bch2_gc_gens_pos_to_text(out, c);
-
-       sysfs_pd_controller_show(rebalance,     &c->rebalance.pd); /* XXX */
-
-       if (attr == &sysfs_copy_gc_wait)
-               bch2_copygc_wait_to_text(out, c);
-
-       if (attr == &sysfs_rebalance_status)
-               bch2_rebalance_status_to_text(out, c);
-
-       if (attr == &sysfs_snapshot_delete_status)
-               bch2_snapshot_delete_status_to_text(out, c);
-
-       if (attr == &sysfs_recovery_status)
-               bch2_recovery_pass_status_to_text(out, c);
-
-       /* Debugging: */
-
-       if (attr == &sysfs_journal_debug)
-               bch2_journal_debug_to_text(out, &c->journal);
-
-       if (attr == &sysfs_btree_cache)
-               bch2_btree_cache_to_text(out, &c->btree_cache);
-
-       if (attr == &sysfs_btree_key_cache)
-               bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
-
-       if (attr == &sysfs_btree_reserve_cache)
-               bch2_btree_reserve_cache_to_text(out, c);
-
-       if (attr == &sysfs_open_buckets)
-               bch2_open_buckets_to_text(out, c, NULL);
-
-       if (attr == &sysfs_open_buckets_partial)
-               bch2_open_buckets_partial_to_text(out, c);
-
-       if (attr == &sysfs_compression_stats)
-               bch2_compression_stats_to_text(out, c);
-
-       if (attr == &sysfs_errors)
-               bch2_fs_errors_to_text(out, c);
-
-       if (attr == &sysfs_new_stripes)
-               bch2_new_stripes_to_text(out, c);
-
-       if (attr == &sysfs_io_timers_read)
-               bch2_io_timers_to_text(out, &c->io_clock[READ]);
-
-       if (attr == &sysfs_io_timers_write)
-               bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
-
-       if (attr == &sysfs_moving_ctxts)
-               bch2_fs_moving_ctxts_to_text(out, c);
-
-       if (attr == &sysfs_write_refs)
-               enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
-
-       if (attr == &sysfs_nocow_lock_table)
-               bch2_nocow_locks_to_text(out, &c->nocow_locks);
-
-       if (attr == &sysfs_disk_groups)
-               bch2_disk_groups_to_text(out, c);
-
-       if (attr == &sysfs_alloc_debug)
-               bch2_fs_alloc_debug_to_text(out, c);
-
-       if (attr == &sysfs_usage_base)
-               bch2_fs_usage_base_to_text(out, c);
-
-       return 0;
-}
-
-STORE(bch2_fs)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
-       sysfs_pd_controller_store(rebalance,    &c->rebalance.pd);
-
-       if (!test_bit(BCH_FS_started, &c->flags))
-               return -EPERM;
-
-       /* Debugging: */
-
-       if (attr == &sysfs_trigger_btree_updates)
-               queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-
-       if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
-               return -EROFS;
-
-       if (attr == &sysfs_trigger_btree_cache_shrink) {
-               struct btree_cache *bc = &c->btree_cache;
-               struct shrink_control sc;
-
-               sc.gfp_mask = GFP_KERNEL;
-               sc.nr_to_scan = strtoul_or_return(buf);
-               bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
-       }
-
-       if (attr == &sysfs_trigger_btree_key_cache_shrink) {
-               struct shrink_control sc;
-
-               sc.gfp_mask = GFP_KERNEL;
-               sc.nr_to_scan = strtoul_or_return(buf);
-               c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
-       }
-
-       if (attr == &sysfs_trigger_gc)
-               bch2_gc_gens(c);
-
-       if (attr == &sysfs_trigger_discards)
-               bch2_do_discards(c);
-
-       if (attr == &sysfs_trigger_invalidates)
-               bch2_do_invalidates(c);
-
-       if (attr == &sysfs_trigger_journal_commit)
-               bch2_journal_flush(&c->journal);
-
-       if (attr == &sysfs_trigger_journal_flush) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_meta(&c->journal);
-       }
-
-       if (attr == &sysfs_trigger_journal_writes)
-               bch2_journal_do_writes(&c->journal);
-
-       if (attr == &sysfs_trigger_freelist_wakeup)
-               closure_wake_up(&c->freelist_wait);
-
-       if (attr == &sysfs_trigger_recalc_capacity) {
-               down_read(&c->state_lock);
-               bch2_recalc_capacity(c);
-               up_read(&c->state_lock);
-       }
-
-       if (attr == &sysfs_trigger_delete_dead_snapshots)
-               __bch2_delete_dead_snapshots(c);
-
-       if (attr == &sysfs_trigger_emergency_read_only) {
-               struct printbuf buf = PRINTBUF;
-               bch2_log_msg_start(c, &buf);
-
-               prt_printf(&buf, "shutdown by sysfs\n");
-               bch2_fs_emergency_read_only2(c, &buf);
-               bch2_print_str(c, KERN_ERR, buf.buf);
-               printbuf_exit(&buf);
-       }
-
-#ifdef CONFIG_BCACHEFS_TESTS
-       if (attr == &sysfs_perf_test) {
-               char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
-               char *test              = strsep(&p, " \t\n");
-               char *nr_str            = strsep(&p, " \t\n");
-               char *threads_str       = strsep(&p, " \t\n");
-               unsigned threads;
-               u64 nr;
-               int ret = -EINVAL;
-
-               if (threads_str &&
-                   !(ret = kstrtouint(threads_str, 10, &threads)) &&
-                   !(ret = bch2_strtoull_h(nr_str, &nr)))
-                       ret = bch2_btree_perf_test(c, test, nr, threads);
-               kfree(tmp);
-
-               if (ret)
-                       size = ret;
-       }
-#endif
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
-       return size;
-}
-SYSFS_OPS(bch2_fs);
-
-struct attribute *bch2_fs_files[] = {
-       &sysfs_minor,
-       &sysfs_btree_cache_size,
-       &sysfs_btree_write_stats,
-
-       &sysfs_rebalance_status,
-       &sysfs_snapshot_delete_status,
-       &sysfs_recovery_status,
-
-       &sysfs_compression_stats,
-       &sysfs_errors,
-
-#ifdef CONFIG_BCACHEFS_TESTS
-       &sysfs_perf_test,
-#endif
-       NULL
-};
-
-/* counters dir */
-
-SHOW(bch2_fs_counters)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
-       u64 counter = 0;
-       u64 counter_since_mount = 0;
-
-       printbuf_tabstop_push(out, 32);
-
-       #define x(t, n, f, ...) \
-               if (attr == &sysfs_##t) {                                       \
-                       counter             = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
-                       counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
-                       if (f & TYPE_SECTORS) {                                 \
-                               counter <<= 9;                                  \
-                               counter_since_mount <<= 9;                      \
-                       }                                                       \
-                                                                               \
-                       prt_printf(out, "since mount:\t");                      \
-                       (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
-                       prt_human_readable_u64(out, counter_since_mount);       \
-                       prt_newline(out);                                       \
-                                                                               \
-                       prt_printf(out, "since filesystem creation:\t");        \
-                       (f & TYPE_COUNTER) ? prt_u64(out, counter) :            \
-                       prt_human_readable_u64(out, counter);                   \
-                       prt_newline(out);                                       \
-               }
-       BCH_PERSISTENT_COUNTERS()
-       #undef x
-       return 0;
-}
-
-STORE(bch2_fs_counters) {
-       return 0;
-}
-
-SYSFS_OPS(bch2_fs_counters);
-
-struct attribute *bch2_fs_counters_files[] = {
-#define x(t, ...) \
-       &sysfs_##t,
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-       NULL
-};
-
-/* internal dir - just a wrapper */
-
-SHOW(bch2_fs_internal)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
-       return bch2_fs_to_text(out, &c->kobj, attr);
-}
-
-STORE(bch2_fs_internal)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
-       return bch2_fs_store(&c->kobj, attr, buf, size);
-}
-SYSFS_OPS(bch2_fs_internal);
-
-struct attribute *bch2_fs_internal_files[] = {
-       &sysfs_flags,
-       &sysfs_journal_debug,
-       &sysfs_btree_cache,
-       &sysfs_btree_key_cache,
-       &sysfs_btree_reserve_cache,
-       &sysfs_new_stripes,
-       &sysfs_open_buckets,
-       &sysfs_open_buckets_partial,
-       &sysfs_write_refs,
-       &sysfs_nocow_lock_table,
-       &sysfs_io_timers_read,
-       &sysfs_io_timers_write,
-
-       &sysfs_trigger_gc,
-       &sysfs_trigger_discards,
-       &sysfs_trigger_invalidates,
-       &sysfs_trigger_journal_commit,
-       &sysfs_trigger_journal_flush,
-       &sysfs_trigger_journal_writes,
-       &sysfs_trigger_btree_cache_shrink,
-       &sysfs_trigger_btree_key_cache_shrink,
-       &sysfs_trigger_btree_updates,
-       &sysfs_trigger_freelist_wakeup,
-       &sysfs_trigger_recalc_capacity,
-       &sysfs_trigger_delete_dead_snapshots,
-       &sysfs_trigger_emergency_read_only,
-
-       &sysfs_gc_gens_pos,
-
-       &sysfs_copy_gc_wait,
-
-       sysfs_pd_controller_files(rebalance),
-
-       &sysfs_moving_ctxts,
-
-       &sysfs_internal_uuid,
-
-       &sysfs_disk_groups,
-       &sysfs_alloc_debug,
-       &sysfs_usage_base,
-       NULL
-};
-
-/* options */
-
-static ssize_t sysfs_opt_show(struct bch_fs *c,
-                             struct bch_dev *ca,
-                             enum bch_opt_id id,
-                             struct printbuf *out)
-{
-       const struct bch_option *opt = bch2_opt_table + id;
-       u64 v;
-
-       if (opt->flags & OPT_FS) {
-               v = bch2_opt_get_by_id(&c->opts, id);
-       } else if ((opt->flags & OPT_DEVICE) && opt->get_member)  {
-               v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
-       } else {
-               return -EINVAL;
-       }
-
-       bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
-       prt_char(out, '\n');
-       return 0;
-}
-
-static ssize_t sysfs_opt_store(struct bch_fs *c,
-                              struct bch_dev *ca,
-                              enum bch_opt_id id,
-                              const char *buf, size_t size)
-{
-       const struct bch_option *opt = bch2_opt_table + id;
-       int ret = 0;
-
-       /*
-        * We don't need to take c->writes for correctness, but it eliminates an
-        * unsightly error message in the dmesg log when we're RO:
-        */
-       if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
-               return -EROFS;
-
-       char *tmp = kstrdup(buf, GFP_KERNEL);
-       if (!tmp) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       u64 v;
-       ret =   bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
-               bch2_opt_hook_pre_set(c, ca, id, v);
-       kfree(tmp);
-
-       if (ret < 0)
-               goto err;
-
-       bool is_sb = opt->get_sb || opt->get_member;
-       bool changed = false;
-
-       if (is_sb) {
-               changed = bch2_opt_set_sb(c, ca, opt, v);
-       } else if (!ca) {
-               changed = bch2_opt_get_by_id(&c->opts, id) != v;
-       } else {
-               /*
-                * Device options that aren't superblock options aren't
-                * supported:
-                */
-               BUG();
-       }
-
-       if (!ca)
-               bch2_opt_set_by_id(&c->opts, id, v);
-
-       if (changed)
-               bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
-
-       ret = size;
-err:
-       enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
-       return ret;
-}
-
-SHOW(bch2_fs_opts_dir)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
-       int id = bch2_opt_lookup(attr->name);
-       if (id < 0)
-               return 0;
-
-       return sysfs_opt_show(c, NULL, id, out);
-}
-
-STORE(bch2_fs_opts_dir)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
-       int id = bch2_opt_lookup(attr->name);
-       if (id < 0)
-               return 0;
-
-       return sysfs_opt_store(c, NULL, id, buf, size);
-}
-SYSFS_OPS(bch2_fs_opts_dir);
-
-struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-
-int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{
-       for (const struct bch_option *i = bch2_opt_table;
-            i < bch2_opt_table + bch2_opts_nr;
-            i++) {
-               if (i->flags & OPT_HIDDEN)
-                       continue;
-               if (!(i->flags & type))
-                       continue;
-
-               int ret = sysfs_create_file(kobj, &i->attr);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* time stats */
-
-SHOW(bch2_fs_time_stats)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name)                                                                \
-       if (attr == &sysfs_time_stat_##name)                            \
-               bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
-       BCH_TIME_STATS()
-#undef x
-
-       return 0;
-}
-
-STORE(bch2_fs_time_stats)
-{
-       struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name)                                                                \
-       if (attr == &sysfs_time_stat_##name)                            \
-               bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
-       BCH_TIME_STATS()
-#undef x
-       return size;
-}
-SYSFS_OPS(bch2_fs_time_stats);
-
-struct attribute *bch2_fs_time_stats_files[] = {
-#define x(name)                                                \
-       &sysfs_time_stat_##name,
-       BCH_TIME_STATS()
-#undef x
-       NULL
-};
-
-static const char * const bch2_rw[] = {
-       "read",
-       "write",
-       NULL
-};
-
-static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-       int rw, i;
-
-       for (rw = 0; rw < 2; rw++) {
-               prt_printf(out, "%s:\n", bch2_rw[rw]);
-
-               for (i = 1; i < BCH_DATA_NR; i++)
-                       prt_printf(out, "%-12s:%12llu\n",
-                              bch2_data_type_str(i),
-                              percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
-       }
-}
-
-SHOW(bch2_dev)
-{
-       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-       struct bch_fs *c = ca->fs;
-
-       sysfs_printf(uuid,              "%pU", ca->uuid.b);
-
-       sysfs_print(first_bucket,       ca->mi.first_bucket);
-       sysfs_print(nbuckets,           ca->mi.nbuckets);
-
-       if (attr == &sysfs_label) {
-               if (ca->mi.group)
-                       bch2_disk_path_to_text(out, c, ca->mi.group - 1);
-               prt_char(out, '\n');
-       }
-
-       if (attr == &sysfs_has_data) {
-               prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
-               prt_char(out, '\n');
-       }
-
-       if (attr == &sysfs_io_done)
-               dev_io_done_to_text(out, ca);
-
-       if (attr == &sysfs_io_errors)
-               bch2_dev_io_errors_to_text(out, ca);
-
-       sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
-       sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
-
-       if (attr == &sysfs_io_latency_stats_read)
-               bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
-
-       if (attr == &sysfs_io_latency_stats_write)
-               bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
-
-       sysfs_printf(congested,                 "%u%%\n",
-                    clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
-                    * 100 / CONGESTED_MAX);
-
-       if (attr == &sysfs_alloc_debug)
-               bch2_dev_alloc_debug_to_text(out, ca);
-
-       if (attr == &sysfs_open_buckets)
-               bch2_open_buckets_to_text(out, c, ca);
-
-       int opt_id = bch2_opt_lookup(attr->name);
-       if (opt_id >= 0)
-               return sysfs_opt_show(c, ca, opt_id, out);
-
-       if (attr == &sysfs_read_refs)
-               enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
-
-       if (attr == &sysfs_write_refs)
-               enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
-
-       return 0;
-}
-
-STORE(bch2_dev)
-{
-       struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-       struct bch_fs *c = ca->fs;
-
-       if (attr == &sysfs_label) {
-               char *tmp;
-               int ret;
-
-               tmp = kstrdup(buf, GFP_KERNEL);
-               if (!tmp)
-                       return -ENOMEM;
-
-               ret = bch2_dev_group_set(c, ca, strim(tmp));
-               kfree(tmp);
-               if (ret)
-                       return ret;
-       }
-
-       if (attr == &sysfs_io_errors_reset)
-               bch2_dev_errors_reset(ca);
-
-       int opt_id = bch2_opt_lookup(attr->name);
-       if (opt_id >= 0)
-               return sysfs_opt_store(c, ca, opt_id, buf, size);
-
-       return size;
-}
-SYSFS_OPS(bch2_dev);
-
-struct attribute *bch2_dev_files[] = {
-       &sysfs_uuid,
-       &sysfs_first_bucket,
-       &sysfs_nbuckets,
-
-       /* settings: */
-       &sysfs_label,
-
-       &sysfs_has_data,
-       &sysfs_io_done,
-       &sysfs_io_errors,
-       &sysfs_io_errors_reset,
-
-       &sysfs_io_latency_read,
-       &sysfs_io_latency_write,
-       &sysfs_io_latency_stats_read,
-       &sysfs_io_latency_stats_write,
-       &sysfs_congested,
-
-       /* debug: */
-       &sysfs_alloc_debug,
-       &sysfs_open_buckets,
-
-       &sysfs_read_refs,
-       &sysfs_write_refs,
-       NULL
-};
-
-#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
deleted file mode 100644 (file)
index 303e043..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SYSFS_H_
-#define _BCACHEFS_SYSFS_H_
-
-#include <linux/sysfs.h>
-
-#ifndef NO_BCACHEFS_SYSFS
-
-struct attribute;
-struct sysfs_ops;
-
-extern struct attribute *bch2_fs_files[];
-extern struct attribute *bch2_fs_counters_files[];
-extern struct attribute *bch2_fs_internal_files[];
-extern struct attribute *bch2_fs_opts_dir_files[];
-extern struct attribute *bch2_fs_time_stats_files[];
-extern struct attribute *bch2_dev_files[];
-
-extern const struct sysfs_ops bch2_fs_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern const struct sysfs_ops bch2_dev_sysfs_ops;
-
-int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
-
-#else
-
-static struct attribute *bch2_fs_files[] = { NULL };
-static struct attribute *bch2_fs_counters_files[] = { NULL };
-static struct attribute *bch2_fs_internal_files[] = { NULL };
-static struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-static struct attribute *bch2_fs_time_stats_files[] = { NULL };
-static struct attribute *bch2_dev_files[] = { NULL };
-
-static const struct sysfs_ops bch2_fs_sysfs_ops;
-static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-static const struct sysfs_ops bch2_dev_sysfs_ops;
-
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{ return 0; }
-
-#endif /* NO_BCACHEFS_SYSFS */
-
-#endif  /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
deleted file mode 100644 (file)
index 782a05f..0000000
+++ /dev/null
@@ -1,891 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_TESTS
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "journal_reclaim.h"
-#include "snapshot.h"
-#include "tests.h"
-
-#include "linux/kthread.h"
-#include "linux/random.h"
-
-static void delete_test_keys(struct bch_fs *c)
-{
-       int ret;
-
-       ret = bch2_btree_delete_range(c, BTREE_ID_extents,
-                                     SPOS(0, 0, U32_MAX),
-                                     POS(0, U64_MAX),
-                                     0, NULL);
-       BUG_ON(ret);
-
-       ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     SPOS(0, 0, U32_MAX),
-                                     POS(0, U64_MAX),
-                                     0, NULL);
-       BUG_ON(ret);
-}
-
-/* unit tests */
-
-static int test_delete(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_i_cookie k;
-       int ret;
-
-       bkey_cookie_init(&k.k_i);
-       k.k.p.snapshot = U32_MAX;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
-                            BTREE_ITER_intent);
-
-       ret = commit_do(trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_trans_update(trans, &iter, &k.k_i, 0));
-       bch_err_msg(c, ret, "update error");
-       if (ret)
-               goto err;
-
-       pr_info("deleting once");
-       ret = commit_do(trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_btree_delete_at(trans, &iter, 0));
-       bch_err_msg(c, ret, "delete error (first)");
-       if (ret)
-               goto err;
-
-       pr_info("deleting twice");
-       ret = commit_do(trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_btree_delete_at(trans, &iter, 0));
-       bch_err_msg(c, ret, "delete error (second)");
-       if (ret)
-               goto err;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
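
test_delete() leans on commit_do(), which re-evaluates the whole
traverse-and-update expression whenever the commit reports a transaction
restart; that is why the two steps are chained with ?: inside the macro call
rather than committed separately. A userspace sketch of that retry shape, with
a made-up RESTART code standing in for bcachefs's transaction-restart errors:

#include <stdio.h>
#include <stdlib.h>

#define RESTART 1       /* stand-in for a transaction-restart error */

/* a transaction step that spuriously asks to be restarted a few times */
static int do_updates(int *attempts)
{
        return ++(*attempts) < 3 ? -RESTART : 0;
}

/* the shape of commit_do(): re-run the closure until it stops restarting */
static int commit_do_sketch(int (*fn)(int *), int *arg)
{
        int ret;

        do {
                ret = fn(arg);
        } while (ret == -RESTART);

        return ret;
}

int main(void)
{
        int attempts = 0;
        int ret = commit_do_sketch(do_updates, &attempts);

        printf("committed after %d attempts, ret %d\n", attempts, ret);
        return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}
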
-
-static int test_delete_written(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_i_cookie k;
-       int ret;
-
-       bkey_cookie_init(&k.k_i);
-       k.k.p.snapshot = U32_MAX;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
-                            BTREE_ITER_intent);
-
-       ret = commit_do(trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_trans_update(trans, &iter, &k.k_i, 0));
-       bch_err_msg(c, ret, "update error");
-       if (ret)
-               goto err;
-
-       bch2_trans_unlock(trans);
-       bch2_journal_flush_all_pins(&c->journal);
-
-       ret = commit_do(trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(trans, &iter) ?:
-               bch2_btree_delete_at(trans, &iter, 0));
-       bch_err_msg(c, ret, "delete error");
-       if (ret)
-               goto err;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int test_iterate(struct bch_fs *c, u64 nr)
-{
-       u64 i;
-       int ret = 0;
-
-       delete_test_keys(c);
-
-       pr_info("inserting test keys");
-
-       for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie ck;
-
-               bkey_cookie_init(&ck.k_i);
-               ck.k.p.offset = i;
-               ck.k.p.snapshot = U32_MAX;
-
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
-               bch_err_msg(c, ret, "insert error");
-               if (ret)
-                       return ret;
-       }
-
-       pr_info("iterating forwards");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
-                                       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                       0, k, ({
-                       BUG_ON(k.k->p.offset != i++);
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i != nr);
-
-       pr_info("iterating backwards");
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
-                               SPOS(0, U64_MAX, U32_MAX), 0, k, ({
-                       BUG_ON(k.k->p.offset != --i);
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating backwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i);
-       return 0;
-}
-
-static int test_iterate_extents(struct bch_fs *c, u64 nr)
-{
-       u64 i;
-       int ret = 0;
-
-       delete_test_keys(c);
-
-       pr_info("inserting test extents");
-
-       for (i = 0; i < nr; i += 8) {
-               struct bkey_i_cookie ck;
-
-               bkey_cookie_init(&ck.k_i);
-               ck.k.p.offset = i + 8;
-               ck.k.p.snapshot = U32_MAX;
-               ck.k.size = 8;
-
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
-               bch_err_msg(c, ret, "insert error");
-               if (ret)
-                       return ret;
-       }
-
-       pr_info("iterating forwards");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                                       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                       0, k, ({
-                       BUG_ON(bkey_start_offset(k.k) != i);
-                       i = k.k->p.offset;
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i != nr);
-
-       pr_info("iterating backwards");
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
-                               SPOS(0, U64_MAX, U32_MAX), 0, k, ({
-                       BUG_ON(k.k->p.offset != i);
-                       i = bkey_start_offset(k.k);
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating backwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i);
-       return 0;
-}
-
-static int test_iterate_slots(struct bch_fs *c, u64 nr)
-{
-       u64 i;
-       int ret = 0;
-
-       delete_test_keys(c);
-
-       pr_info("inserting test keys");
-
-       for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie ck;
-
-               bkey_cookie_init(&ck.k_i);
-               ck.k.p.offset = i * 2;
-               ck.k.p.snapshot = U32_MAX;
-
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
-               bch_err_msg(c, ret, "insert error");
-               if (ret)
-                       return ret;
-       }
-
-       pr_info("iterating forwards");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
-                                         SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                         0, k, ({
-                       BUG_ON(k.k->p.offset != i);
-                       i += 2;
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i != nr * 2);
-
-       pr_info("iterating forwards by slots");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
-                                       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                       BTREE_ITER_slots, k, ({
-                       if (i >= nr * 2)
-                               break;
-
-                       BUG_ON(k.k->p.offset != i);
-                       BUG_ON(bkey_deleted(k.k) != (i & 1));
-
-                       i++;
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards by slots");
-       return ret;
-}
-
-static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
-{
-       u64 i;
-       int ret = 0;
-
-       delete_test_keys(c);
-
-       pr_info("inserting test keys");
-
-       for (i = 0; i < nr; i += 16) {
-               struct bkey_i_cookie ck;
-
-               bkey_cookie_init(&ck.k_i);
-               ck.k.p.offset = i + 16;
-               ck.k.p.snapshot = U32_MAX;
-               ck.k.size = 8;
-
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
-               bch_err_msg(c, ret, "insert error");
-               if (ret)
-                       return ret;
-       }
-
-       pr_info("iterating forwards");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                                       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                       0, k, ({
-                       BUG_ON(bkey_start_offset(k.k) != i + 8);
-                       BUG_ON(k.k->size != 8);
-                       i += 16;
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards");
-       if (ret)
-               return ret;
-
-       BUG_ON(i != nr);
-
-       pr_info("iterating forwards by slots");
-       i = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                                       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                       BTREE_ITER_slots, k, ({
-                       if (i == nr)
-                               break;
-                       BUG_ON(bkey_deleted(k.k) != !(i % 16));
-
-                       BUG_ON(bkey_start_offset(k.k) != i);
-                       BUG_ON(k.k->size != 8);
-                       i = k.k->p.offset;
-                       0;
-               })));
-       bch_err_msg(c, ret, "error iterating forwards by slots");
-       return ret;
-}
-
-/*
- * XXX: we really want to make sure we've got a btree with depth > 0 for these
- * tests
- */
-static int test_peek_end(struct bch_fs *c, u64 nr)
-{
-       delete_test_keys(c);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
-                            SPOS(0, 0, U32_MAX), 0);
-
-       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-       BUG_ON(k.k);
-
-       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-       BUG_ON(k.k);
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return 0;
-}
-
-static int test_peek_end_extents(struct bch_fs *c, u64 nr)
-{
-       delete_test_keys(c);
-
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(0, 0, U32_MAX), 0);
-
-       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-       BUG_ON(k.k);
-
-       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-       BUG_ON(k.k);
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return 0;
-}
-
-/* extent unit tests */
-
-static u64 test_version;
-
-static int insert_test_extent(struct bch_fs *c,
-                             u64 start, u64 end)
-{
-       struct bkey_i_cookie k;
-       int ret;
-
-       bkey_cookie_init(&k.k_i);
-       k.k_i.k.p.offset = end;
-       k.k_i.k.p.snapshot = U32_MAX;
-       k.k_i.k.size = end - start;
-       k.k_i.k.bversion.lo = test_version++;
-
-       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int __test_extent_overwrite(struct bch_fs *c,
-                                   u64 e1_start, u64 e1_end,
-                                   u64 e2_start, u64 e2_end)
-{
-       int ret;
-
-       ret   = insert_test_extent(c, e1_start, e1_end) ?:
-               insert_test_extent(c, e2_start, e2_end);
-
-       delete_test_keys(c);
-       return ret;
-}
-
-static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
-{
-       return  __test_extent_overwrite(c, 0, 64, 0, 32) ?:
-               __test_extent_overwrite(c, 8, 64, 0, 32);
-}
-
-static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
-{
-       return  __test_extent_overwrite(c, 0, 64, 32, 64) ?:
-               __test_extent_overwrite(c, 0, 64, 32, 72);
-}
-
-static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
-{
-       return __test_extent_overwrite(c, 0, 64, 32, 40);
-}
-
-static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
-{
-       return  __test_extent_overwrite(c, 32, 64,  0,  64) ?:
-               __test_extent_overwrite(c, 32, 64,  0, 128) ?:
-               __test_extent_overwrite(c, 32, 64, 32,  64) ?:
-               __test_extent_overwrite(c, 32, 64, 32, 128);
-}
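
Together these four helpers cover every way a later extent [s2, e2) can
overlap an earlier one [s1, e1): clipping its front, clipping its back,
punching a hole through the middle, or replacing it outright. A standalone
sketch of that case analysis (illustrative code, not bcachefs API):

#include <stdint.h>
#include <stdio.h>

/* classify how [s2,e2) overwrites [s1,e1); intervals are half-open */
static const char *overwrite_case(uint64_t s1, uint64_t e1,
                                  uint64_t s2, uint64_t e2)
{
        if (e2 <= s1 || s2 >= e1)
                return "no overlap";
        if (s2 <= s1 && e2 >= e1)
                return "overwrite all";
        if (s2 <= s1)
                return "overwrite front";       /* survivor: [e2, e1) */
        if (e2 >= e1)
                return "overwrite back";        /* survivor: [s1, s2) */
        return "overwrite middle";              /* survivors: [s1, s2) and [e2, e1) */
}

int main(void)
{
        /* the same cases the tests above exercise */
        printf("%s\n", overwrite_case(0, 64,  0,  32));        /* front  */
        printf("%s\n", overwrite_case(0, 64, 32,  64));        /* back   */
        printf("%s\n", overwrite_case(0, 64, 32,  40));        /* middle */
        printf("%s\n", overwrite_case(32, 64, 0, 128));        /* all    */
        return 0;
}
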
-
-static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
-{
-       struct bkey_i_cookie k;
-       int ret;
-
-       bkey_cookie_init(&k.k_i);
-       k.k_i.k.p.inode = inum;
-       k.k_i.k.p.offset = start + len;
-       k.k_i.k.p.snapshot = snapid;
-       k.k_i.k.size = len;
-
-       ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-               bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
-                                           BTREE_UPDATE_internal_snapshot_node));
-       bch_err_fn(c, ret);
-       return ret;
-}
-
-static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
-{
-       return  insert_test_overlapping_extent(c, inum,  0, 16, U32_MAX - 2) ?: /* overwrite entire */
-               insert_test_overlapping_extent(c, inum,  2,  8, U32_MAX - 2) ?:
-               insert_test_overlapping_extent(c, inum,  4,  4, U32_MAX) ?:
-               insert_test_overlapping_extent(c, inum, 32,  8, U32_MAX - 2) ?: /* overwrite front/back */
-               insert_test_overlapping_extent(c, inum, 36,  8, U32_MAX) ?:
-               insert_test_overlapping_extent(c, inum, 60,  8, U32_MAX - 2) ?:
-               insert_test_overlapping_extent(c, inum, 64,  8, U32_MAX);
-}
-
-/* snapshot unit tests */
-
-/* Test skipping over keys in unrelated snapshots: */
-static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
-{
-       struct btree_trans *trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_i_cookie cookie;
-       int ret;
-
-       bkey_cookie_init(&cookie.k_i);
-       cookie.k.p.snapshot = snapid_hi;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
-       if (ret)
-               return ret;
-
-       trans = bch2_trans_get(c);
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
-                            SPOS(0, 0, snapid_lo), 0);
-       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-
-       BUG_ON(k.k->p.snapshot != U32_MAX);
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int test_snapshots(struct bch_fs *c, u64 nr)
-{
-       struct bkey_i_cookie cookie;
-       u32 snapids[2];
-       u32 snapid_subvols[2] = { 1, 1 };
-       int ret;
-
-       bkey_cookie_init(&cookie.k_i);
-       cookie.k.p.snapshot = U32_MAX;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
-       if (ret)
-               return ret;
-
-       ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-                     bch2_snapshot_node_create(trans, U32_MAX,
-                                               snapids,
-                                               snapid_subvols,
-                                               2));
-       if (ret)
-               return ret;
-
-       if (snapids[0] > snapids[1])
-               swap(snapids[0], snapids[1]);
-
-       ret = test_snapshot_filter(c, snapids[0], snapids[1]);
-       bch_err_msg(c, ret, "from test_snapshot_filter");
-       return ret;
-}
-
-/* perf tests */
-
-static u64 test_rand(void)
-{
-       u64 v;
-
-       get_random_bytes(&v, sizeof(v));
-       return v;
-}
-
-static int rand_insert(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bkey_i_cookie k;
-       int ret = 0;
-       u64 i;
-
-       for (i = 0; i < nr; i++) {
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = test_rand();
-               k.k.p.snapshot = U32_MAX;
-
-               ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int rand_insert_multi(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct bkey_i_cookie k[8];
-       int ret = 0;
-       unsigned j;
-       u64 i;
-
-       for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
-               for (j = 0; j < ARRAY_SIZE(k); j++) {
-                       bkey_cookie_init(&k[j].k_i);
-                       k[j].k.p.offset = test_rand();
-                       k[j].k.p.snapshot = U32_MAX;
-               }
-
-               ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
-                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int rand_lookup(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-       u64 i;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
-                            SPOS(0, 0, U32_MAX), 0);
-
-       for (i = 0; i < nr; i++) {
-               bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
-
-               lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int rand_mixed_trans(struct btree_trans *trans,
-                           struct btree_iter *iter,
-                           struct bkey_i_cookie *cookie,
-                           u64 i, u64 pos)
-{
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
-
-       k = bch2_btree_iter_peek(trans, iter);
-       ret = bkey_err(k);
-       bch_err_msg(trans->c, ret, "lookup error");
-       if (ret)
-               return ret;
-
-       if (!(i & 3) && k.k) {
-               bkey_cookie_init(&cookie->k_i);
-               cookie->k.p = iter->pos;
-               ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
-       }
-
-       return ret;
-}
-
-static int rand_mixed(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       struct btree_iter iter;
-       struct bkey_i_cookie cookie;
-       int ret = 0;
-       u64 i, rand;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
-                            SPOS(0, 0, U32_MAX), 0);
-
-       for (i = 0; i < nr; i++) {
-               rand = test_rand();
-               ret = commit_do(trans, NULL, NULL, 0,
-                       rand_mixed_trans(trans, &iter, &cookie, i, rand));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int __do_delete(struct btree_trans *trans, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
-                            BTREE_ITER_intent);
-       k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (!k.k)
-               goto err;
-
-       ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int rand_delete(struct bch_fs *c, u64 nr)
-{
-       struct btree_trans *trans = bch2_trans_get(c);
-       int ret = 0;
-       u64 i;
-
-       for (i = 0; i < nr; i++) {
-               struct bpos pos = SPOS(0, test_rand(), U32_MAX);
-
-               ret = commit_do(trans, NULL, NULL, 0,
-                       __do_delete(trans, pos));
-               if (ret)
-                       break;
-       }
-
-       bch2_trans_put(trans);
-       return ret;
-}
-
-static int seq_insert(struct bch_fs *c, u64 nr)
-{
-       struct bkey_i_cookie insert;
-
-       bkey_cookie_init(&insert.k_i);
-
-       return bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
-                                       SPOS(0, 0, U32_MAX),
-                                       BTREE_ITER_slots|BTREE_ITER_intent, k,
-                                       NULL, NULL, 0, ({
-                       if (iter.pos.offset >= nr)
-                               break;
-                       insert.k.p = iter.pos;
-                       bch2_trans_update(trans, &iter, &insert.k_i, 0);
-               })));
-}
-
-static int seq_lookup(struct bch_fs *c, u64 nr)
-{
-       return bch2_trans_run(c,
-               for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
-                                 SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
-                                 0, k,
-               0));
-}
-
-static int seq_overwrite(struct bch_fs *c, u64 nr)
-{
-       return bch2_trans_run(c,
-               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
-                                       SPOS(0, 0, U32_MAX),
-                                       BTREE_ITER_intent, k,
-                                       NULL, NULL, 0, ({
-                       struct bkey_i_cookie u;
-
-                       bkey_reassemble(&u.k_i, k);
-                       bch2_trans_update(trans, &iter, &u.k_i, 0);
-               })));
-}
-
-static int seq_delete(struct bch_fs *c, u64 nr)
-{
-       return bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     SPOS(0, 0, U32_MAX),
-                                     POS(0, U64_MAX),
-                                     0, NULL);
-}
-
-typedef int (*perf_test_fn)(struct bch_fs *, u64);
-
-struct test_job {
-       struct bch_fs                   *c;
-       u64                             nr;
-       unsigned                        nr_threads;
-       perf_test_fn                    fn;
-
-       atomic_t                        ready;
-       wait_queue_head_t               ready_wait;
-
-       atomic_t                        done;
-       struct completion               done_completion;
-
-       u64                             start;
-       u64                             finish;
-       int                             ret;
-};
-
-static int btree_perf_test_thread(void *data)
-{
-       struct test_job *j = data;
-       int ret;
-
-       if (atomic_dec_and_test(&j->ready)) {
-               wake_up(&j->ready_wait);
-               j->start = sched_clock();
-       } else {
-               wait_event(j->ready_wait, !atomic_read(&j->ready));
-       }
-
-       ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
-       if (ret) {
-               bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
-               j->ret = ret;
-       }
-
-       if (atomic_dec_and_test(&j->done)) {
-               j->finish = sched_clock();
-               complete(&j->done_completion);
-       }
-
-       return 0;
-}
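
The ready/done counters arrange for the last thread to become ready to start
the clock and the last thread to finish to stop it, so thread startup and
teardown are excluded from the measured interval. The same shape in userspace
with pthreads (the workload is a stub, and the spin-wait stands in for the
kernel's wait_event(); build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define NR_THREADS 4

static atomic_int ready = NR_THREADS, done = NR_THREADS;
static struct timespec start, finish;

static void *worker(void *arg)
{
        /* last thread in starts the clock */
        if (atomic_fetch_sub(&ready, 1) == 1)
                clock_gettime(CLOCK_MONOTONIC, &start);
        else
                while (atomic_load(&ready))
                        ;       /* spin; the kernel code sleeps instead */

        for (volatile long i = 0; i < 10000000; i++)
                ;               /* stub workload */

        /* last thread out stops the clock */
        if (atomic_fetch_sub(&done, 1) == 1)
                clock_gettime(CLOCK_MONOTONIC, &finish);
        return NULL;
}

int main(void)
{
        pthread_t t[NR_THREADS];

        for (int i = 0; i < NR_THREADS; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < NR_THREADS; i++)
                pthread_join(t[i], NULL);

        long ns = (finish.tv_sec - start.tv_sec) * 1000000000L +
                  (finish.tv_nsec - start.tv_nsec);
        printf("measured %ld ns across %d threads\n", ns, NR_THREADS);
        return 0;
}
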
-
-int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
-                        u64 nr, unsigned nr_threads)
-{
-       struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
-       char name_buf[20];
-       struct printbuf nr_buf = PRINTBUF;
-       struct printbuf per_sec_buf = PRINTBUF;
-       unsigned i;
-       u64 time;
-
-       if (nr == 0 || nr_threads == 0) {
-               pr_err("nr of iterations or threads is not allowed to be 0");
-               return -EINVAL;
-       }
-
-       atomic_set(&j.ready, nr_threads);
-       init_waitqueue_head(&j.ready_wait);
-
-       atomic_set(&j.done, nr_threads);
-       init_completion(&j.done_completion);
-
-#define perf_test(_test)                               \
-       if (!strcmp(testname, #_test)) j.fn = _test
-
-       perf_test(rand_insert);
-       perf_test(rand_insert_multi);
-       perf_test(rand_lookup);
-       perf_test(rand_mixed);
-       perf_test(rand_delete);
-
-       perf_test(seq_insert);
-       perf_test(seq_lookup);
-       perf_test(seq_overwrite);
-       perf_test(seq_delete);
-
-       /* unit tests, not perf tests: */
-       perf_test(test_delete);
-       perf_test(test_delete_written);
-       perf_test(test_iterate);
-       perf_test(test_iterate_extents);
-       perf_test(test_iterate_slots);
-       perf_test(test_iterate_slots_extents);
-       perf_test(test_peek_end);
-       perf_test(test_peek_end_extents);
-
-       perf_test(test_extent_overwrite_front);
-       perf_test(test_extent_overwrite_back);
-       perf_test(test_extent_overwrite_middle);
-       perf_test(test_extent_overwrite_all);
-       perf_test(test_extent_create_overlapping);
-
-       perf_test(test_snapshots);
-
-       if (!j.fn) {
-               pr_err("unknown test %s", testname);
-               return -EINVAL;
-       }
-
-       //pr_info("running test %s:", testname);
-
-       if (nr_threads == 1)
-               btree_perf_test_thread(&j);
-       else
-               for (i = 0; i < nr_threads; i++)
-                       kthread_run(btree_perf_test_thread, &j,
-                                   "bcachefs perf test[%u]", i);
-
-       while (wait_for_completion_interruptible(&j.done_completion))
-               ;
-
-       time = j.finish - j.start;
-
-       scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
-       prt_human_readable_u64(&nr_buf, nr);
-       prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
-       printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
-               name_buf, nr_buf.buf, nr_threads,
-               div_u64(time, NSEC_PER_SEC),
-               div_u64(time * nr_threads, nr),
-               per_sec_buf.buf);
-       printbuf_exit(&per_sec_buf);
-       printbuf_exit(&nr_buf);
-       return j.ret;
-}
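
bch2_btree_perf_test() is driven from userspace through the filesystem's
sysfs directory. Assuming a perf_test attribute that accepts
"testname nr nr_threads" (the attribute name and input format are an
assumption here; check the sysfs code for the exact interface), a run might
look like:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* hypothetical path: substitute a real filesystem UUID */
        const char *path = "/sys/fs/bcachefs/<uuid>/perf_test";
        const char *cmd = "rand_insert 1000000 4\n";    /* test, nr, threads */

        int fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, cmd, strlen(cmd)) < 0) {
                perror(path);
                return 1;
        }
        close(fd);
        return 0;       /* the results above are printed to the kernel log */
}
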
-
-#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
deleted file mode 100644 (file)
index c73b18a..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TEST_H
-#define _BCACHEFS_TEST_H
-
-struct bch_fs;
-
-#ifdef CONFIG_BCACHEFS_TESTS
-
-int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
-
-#else
-
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
deleted file mode 100644 (file)
index 314a24d..0000000
+++ /dev/null
@@ -1,494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
-       if (thr->task) {
-               kthread_stop(thr->task);
-               put_task_struct(thr->task);
-       }
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
-                             const struct file_operations *fops,
-                             int (*fn)(void *))
-{
-       struct file *file = NULL;
-       int ret, fd = -1;
-       unsigned fd_flags = O_CLOEXEC;
-
-       if (fops->read && fops->write)
-               fd_flags |= O_RDWR;
-       else if (fops->read)
-               fd_flags |= O_RDONLY;
-       else if (fops->write)
-               fd_flags |= O_WRONLY;
-
-       char name[TASK_COMM_LEN];
-       get_task_comm(name, current);
-
-       thr->ret = 0;
-       thr->task = kthread_create(fn, thr, "%s", name);
-       ret = PTR_ERR_OR_ZERO(thr->task);
-       if (ret)
-               return ret;
-
-       ret = get_unused_fd_flags(fd_flags);
-       if (ret < 0)
-               goto err;
-       fd = ret;
-
-       file = anon_inode_getfile(name, fops, thr, fd_flags);
-       ret = PTR_ERR_OR_ZERO(file);
-       if (ret)
-               goto err;
-
-       get_task_struct(thr->task);
-       wake_up_process(thr->task);
-       fd_install(fd, file);
-       return fd;
-err:
-       if (fd >= 0)
-               put_unused_fd(fd);
-       if (thr->task)
-               kthread_stop(thr->task);
-       return ret;
-}
-
-/* stdio_redirect */
-
-static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
-{
-       return stdio->input.buf.nr > seen || stdio->done;
-}
-
-static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
-{
-       return stdio_redirect_has_more_input(stdio, 0);
-}
-
-static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
-{
-       return stdio->output.buf.nr || stdio->done;
-}
-
-#define STDIO_REDIRECT_BUFSIZE         4096
-
-static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
-{
-       return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
-{
-       return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static void stdio_buf_init(struct stdio_buf *buf)
-{
-       spin_lock_init(&buf->lock);
-       init_waitqueue_head(&buf->wait);
-       darray_init(&buf->buf);
-}
-
-/* thread_with_stdio */
-
-static void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
-       thr->thr.done = true;
-       thr->stdio.done = true;
-       wake_up(&thr->stdio.input.wait);
-       wake_up(&thr->stdio.output.wait);
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
-                                     size_t len, loff_t *ppos)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-       struct stdio_buf *buf = &thr->stdio.output;
-       size_t copied = 0, b;
-       int ret = 0;
-
-       if (!(file->f_flags & O_NONBLOCK)) {
-               ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
-               if (ret)
-                       return ret;
-       } else if (!stdio_redirect_has_output(&thr->stdio))
-               return -EAGAIN;
-
-       while (len && buf->buf.nr) {
-               if (fault_in_writeable(ubuf, len) == len) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               spin_lock_irq(&buf->lock);
-               b = min_t(size_t, len, buf->buf.nr);
-
-               if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
-                       ubuf    += b;
-                       len     -= b;
-                       copied  += b;
-                       buf->buf.nr -= b;
-                       memmove(buf->buf.data,
-                               buf->buf.data + b,
-                               buf->buf.nr);
-               }
-               spin_unlock_irq(&buf->lock);
-       }
-
-       return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       thread_with_stdio_done(thr);
-       bch2_thread_with_file_exit(&thr->thr);
-       darray_exit(&thr->stdio.input.buf);
-       darray_exit(&thr->stdio.output.buf);
-       thr->ops->exit(thr);
-       return 0;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
-                                      size_t len, loff_t *ppos)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-       struct stdio_buf *buf = &thr->stdio.input;
-       size_t copied = 0;
-       ssize_t ret = 0;
-
-       while (len) {
-               if (thr->thr.done) {
-                       ret = -EPIPE;
-                       break;
-               }
-
-               size_t b = len - fault_in_readable(ubuf, len);
-               if (!b) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               spin_lock(&buf->lock);
-               size_t makeroom = b;
-               if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
-                       makeroom = min_t(ssize_t, makeroom,
-                                  max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
-                                                 0));
-               darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
-
-               b = min(len, darray_room(buf->buf));
-
-               if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
-                       buf->buf.nr += b;
-                       ubuf    += b;
-                       len     -= b;
-                       copied  += b;
-               }
-               spin_unlock(&buf->lock);
-
-               if (b) {
-                       wake_up(&buf->wait);
-               } else {
-                       if ((file->f_flags & O_NONBLOCK)) {
-                               ret = -EAGAIN;
-                               break;
-                       }
-
-                       ret = wait_event_interruptible(buf->wait,
-                                       stdio_redirect_has_input_space(&thr->stdio));
-                       if (ret)
-                               break;
-               }
-       }
-
-       return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       poll_wait(file, &thr->stdio.output.wait, wait);
-       poll_wait(file, &thr->stdio.input.wait, wait);
-
-       __poll_t mask = 0;
-
-       if (stdio_redirect_has_output(&thr->stdio))
-               mask |= EPOLLIN;
-       if (stdio_redirect_has_input_space(&thr->stdio))
-               mask |= EPOLLOUT;
-       if (thr->thr.done)
-               mask |= EPOLLHUP|EPOLLERR;
-       return mask;
-}
-
-static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       poll_wait(file, &thr->stdio.output.wait, wait);
-
-       __poll_t mask = 0;
-
-       if (stdio_redirect_has_output(&thr->stdio))
-               mask |= EPOLLIN;
-       if (thr->thr.done)
-               mask |= EPOLLHUP|EPOLLERR;
-       return mask;
-}
-
-static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       return thr->thr.ret;
-}
-
-static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
-{
-       struct thread_with_stdio *thr =
-               container_of(file->private_data, struct thread_with_stdio, thr);
-
-       if (thr->ops->unlocked_ioctl)
-               return thr->ops->unlocked_ioctl(thr, cmd, p);
-       return -ENOTTY;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
-       .read           = thread_with_stdio_read,
-       .write          = thread_with_stdio_write,
-       .poll           = thread_with_stdio_poll,
-       .flush          = thread_with_stdio_flush,
-       .release        = thread_with_stdio_release,
-       .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static const struct file_operations thread_with_stdout_fops = {
-       .read           = thread_with_stdio_read,
-       .poll           = thread_with_stdout_poll,
-       .flush          = thread_with_stdio_flush,
-       .release        = thread_with_stdio_release,
-       .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static int thread_with_stdio_fn(void *arg)
-{
-       struct thread_with_stdio *thr = arg;
-
-       thr->thr.ret = thr->ops->fn(thr);
-
-       thread_with_stdio_done(thr);
-       return 0;
-}
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
-                                const struct thread_with_stdio_ops *ops)
-{
-       stdio_buf_init(&thr->stdio.input);
-       stdio_buf_init(&thr->stdio.output);
-       thr->ops = ops;
-}
-
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
-{
-       return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
-                              const struct thread_with_stdio_ops *ops)
-{
-       bch2_thread_with_stdio_init(thr, ops);
-
-       return __bch2_run_thread_with_stdio(thr);
-}
-
-int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
-                               const struct thread_with_stdio_ops *ops)
-{
-       stdio_buf_init(&thr->stdio.input);
-       stdio_buf_init(&thr->stdio.output);
-       thr->ops = ops;
-
-       return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
-}
-EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
-{
-       struct stdio_buf *buf = &stdio->input;
-
-       /*
-        * We're waiting on user input (or for the file descriptor to be
-        * closed), so we don't want a hung task warning:
-        */
-       do {
-               wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
-                                  sysctl_hung_task_timeout_secs * HZ / 2);
-       } while (!stdio_redirect_has_input(stdio));
-
-       if (stdio->done)
-               return -1;
-
-       spin_lock(&buf->lock);
-       int ret = min(len, buf->buf.nr);
-       buf->buf.nr -= ret;
-       memcpy(ubuf, buf->buf.data, ret);
-       memmove(buf->buf.data,
-               buf->buf.data + ret,
-               buf->buf.nr);
-       spin_unlock(&buf->lock);
-
-       wake_up(&buf->wait);
-       return ret;
-}
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
-                                        darray_char *line,
-                                        unsigned long timeout)
-{
-       unsigned long until = jiffies + timeout, t;
-       struct stdio_buf *buf = &stdio->input;
-       size_t seen = 0;
-again:
-       t = timeout != MAX_SCHEDULE_TIMEOUT
-               ? max_t(long, until - jiffies, 0)
-               : timeout;
-
-       t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
-
-       wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
-
-       if (stdio->done)
-               return -1;
-
-       spin_lock(&buf->lock);
-       seen = buf->buf.nr;
-       char *n = memchr(buf->buf.data, '\n', seen);
-
-       if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
-               spin_unlock(&buf->lock);
-               return -ETIME;
-       }
-
-       if (!n) {
-               buf->waiting_for_line = true;
-               spin_unlock(&buf->lock);
-               goto again;
-       }
-
-       size_t b = n + 1 - buf->buf.data;
-       if (b > line->size) {
-               spin_unlock(&buf->lock);
-               int ret = darray_resize(line, b);
-               if (ret)
-                       return ret;
-               seen = 0;
-               goto again;
-       }
-
-       buf->buf.nr -= b;
-       memcpy(line->data, buf->buf.data, b);
-       memmove(buf->buf.data,
-               buf->buf.data + b,
-               buf->buf.nr);
-       line->nr = b;
-
-       buf->waiting_for_line = false;
-       spin_unlock(&buf->lock);
-
-       wake_up(&buf->wait);
-       return 0;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
-{
-       return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
-}
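
The readline path above waits until a newline is buffered, copies everything
up to and including it, then shifts the remaining bytes down with memmove().
The core buffer manipulation, as a standalone sketch:

#include <stdio.h>
#include <string.h>

/*
 * Pop one '\n'-terminated line from the front of buf (len bytes used);
 * returns the line length including the newline, 0 if no full line yet.
 */
static size_t pop_line(char *buf, size_t *len, char *line, size_t line_sz)
{
        char *n = memchr(buf, '\n', *len);
        if (!n)
                return 0;       /* caller keeps waiting for more input */

        size_t b = n + 1 - buf;
        if (b > line_sz)
                return 0;       /* caller must grow 'line' first, as above */

        memcpy(line, buf, b);
        memmove(buf, buf + b, *len - b);        /* shift the tail down */
        *len -= b;
        return b;
}

int main(void)
{
        char buf[64] = "first line\nsecond";
        size_t len = strlen(buf);
        char line[32];

        size_t b = pop_line(buf, &len, line, sizeof(line));
        printf("got %zu bytes: %.*s", b, (int) b, line);
        printf("%zu bytes left over\n", len);
        return 0;
}
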
-
-__printf(3, 0)
-static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
-{
-       ssize_t ret;
-
-       do {
-               va_list args2;
-               size_t len;
-
-               va_copy(args2, args);
-               len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
-               va_end(args2);
-
-               if (len + 1 <= darray_room(*out)) {
-                       out->nr += len;
-                       return len;
-               }
-
-               ret = darray_make_room_gfp(out, len + 1, gfp);
-       } while (ret == 0);
-
-       return ret;
-}
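
bch2_darray_vprintf() exploits the C99 guarantee that vsnprintf() returns the
length the output *would* have had: on overflow it grows the buffer and
re-runs the format, and va_copy() is needed because a va_list cannot be
walked twice. The same loop against realloc(), as a userspace sketch
(growbuf_printf() takes variadic arguments directly, so it restarts with
va_start() instead of va_copy()):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct growbuf {
        char    *data;
        size_t  nr, size;
};

static int growbuf_printf(struct growbuf *out, const char *fmt, ...)
{
        va_list args;

        while (1) {
                size_t room = out->size - out->nr;

                va_start(args, fmt);
                int len = vsnprintf(out->data + out->nr, room, fmt, args);
                va_end(args);

                if (len < 0)
                        return -1;
                if ((size_t) len + 1 <= room) {         /* fit, incl. the NUL */
                        out->nr += len;
                        return 0;
                }

                /* didn't fit: grow, then re-run the format */
                char *p = realloc(out->data, out->nr + len + 1);
                if (!p)
                        return -1;
                out->data = p;
                out->size = out->nr + len + 1;
        }
}

int main(void)
{
        struct growbuf b = { .data = calloc(1, 1), .nr = 0, .size = 1 };

        growbuf_printf(&b, "hello %s, ", "world");
        growbuf_printf(&b, "%d + %d = %d\n", 2, 2, 4);
        fputs(b.data, stdout);
        free(b.data);
        return 0;
}
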
-
-ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
-                                   const char *fmt, va_list args)
-{
-       struct stdio_buf *buf = &stdio->output;
-       unsigned long flags;
-       ssize_t ret;
-again:
-       if (stdio->done)
-               return -EPIPE;
-
-       spin_lock_irqsave(&buf->lock, flags);
-       ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
-       spin_unlock_irqrestore(&buf->lock, flags);
-
-       if (ret < 0) {
-               if (nonblocking)
-                       return -EAGAIN;
-
-               ret = wait_event_interruptible(buf->wait,
-                               stdio_redirect_has_output_space(stdio));
-               if (ret)
-                       return ret;
-               goto again;
-       }
-
-       wake_up(&buf->wait);
-       return ret;
-}
-
-ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
-                               const char *fmt, ...)
-{
-       va_list args;
-       ssize_t ret;
-
-       va_start(args, fmt);
-       ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
-       va_end(args);
-
-       return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
deleted file mode 100644 (file)
index 72497b9..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-/*
- * Thread with file: Run a kthread and connect it to a file descriptor, so that
- * it can be interacted with via fd read/write methods and closing the file
- * descriptor stops the kthread.
- *
- * We have two different APIs:
- *
- * thread_with_file, the low-level version:
- * You get to define the full file_operations, including your release function,
- * which means that you must call bch2_thread_with_file_exit() from your
- * .release method.
- *
- * thread_with_stdio, the higher-level version:
- * This implements full piping of input and output, including .poll.
- *
- * Notes on behaviour:
- *  - kthread shutdown behaves like writing or reading from a pipe that has been
- *    closed
- *  - Input and output buffers are 4096 bytes, although buffers may in some
- *    situations slightly exceed that limit so as to avoid chopping off a
- *    message in the middle in nonblocking mode.
- *  - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
- *    should be fine but might change in future revisions.
- *  - Output buffer may grow past 4096 bytes to deal with messages that are
- *    bigger than 4096 bytes
- *  - Writing may be done blocking or nonblocking; in nonblocking mode, we only
- *    drop entire messages.
- *
- * To write, use stdio_redirect_printf()
- * To read, use stdio_redirect_read() or stdio_redirect_readline()
- */
-
-struct task_struct;
-
-struct thread_with_file {
-       struct task_struct      *task;
-       int                     ret;
-       bool                    done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
-                             const struct file_operations *,
-                             int (*fn)(void *));
-
-struct thread_with_stdio;
-
-struct thread_with_stdio_ops {
-       void (*exit)(struct thread_with_stdio *);
-       int (*fn)(struct thread_with_stdio *);
-       long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
-};
-
-struct thread_with_stdio {
-       struct thread_with_file thr;
-       struct stdio_redirect   stdio;
-       const struct thread_with_stdio_ops      *ops;
-};
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *,
-                                const struct thread_with_stdio_ops *);
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
-                              const struct thread_with_stdio_ops *);
-int bch2_run_thread_with_stdout(struct thread_with_stdio *,
-                               const struct thread_with_stdio_ops *);
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
-
-__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
-__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
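
A consumer embeds struct thread_with_stdio, fills in a thread_with_stdio_ops,
and gets back a file descriptor wired to the kthread; closing the fd stops the
thread. A sketch using only the API declared above; my_job and its allocation
strategy are illustrative, and error handling is trimmed:

#include <linux/slab.h>

#include "thread_with_file.h"

struct my_job {
        struct thread_with_stdio        thr;
        /* job-specific state would go here */
};

static int my_job_fn(struct thread_with_stdio *thr)
{
        bch2_stdio_redirect_printf(&thr->stdio, false, "working...\n");
        return 0;       /* becomes the fd's flush (close) return value */
}

static void my_job_exit(struct thread_with_stdio *thr)
{
        kfree(container_of(thr, struct my_job, thr));
}

static const struct thread_with_stdio_ops my_job_ops = {
        .fn     = my_job_fn,
        .exit   = my_job_exit,
};

/* returns the new fd on success, a negative error code on failure */
static int run_my_job(void)
{
        struct my_job *job = kzalloc(sizeof(*job), GFP_KERNEL);

        if (!job)
                return -ENOMEM;
        return bch2_run_thread_with_stdio(&job->thr, &my_job_ops);
}
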
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
deleted file mode 100644 (file)
index f4d484d..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-#include "darray.h"
-
-struct stdio_buf {
-       spinlock_t              lock;
-       wait_queue_head_t       wait;
-       darray_char             buf;
-       bool                    waiting_for_line;
-};
-
-struct stdio_redirect {
-       struct stdio_buf        input;
-       struct stdio_buf        output;
-       bool                    done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
deleted file mode 100644 (file)
index 2c34fe4..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-
-#include "eytzinger.h"
-#include "time_stats.h"
-
-/* disable automatic switching to percpu mode */
-#define TIME_STATS_NONPCPU     ((unsigned long) 1)
-
-static const struct time_unit time_units[] = {
-       { "ns",         1                },
-       { "us",         NSEC_PER_USEC    },
-       { "ms",         NSEC_PER_MSEC    },
-       { "s",          NSEC_PER_SEC     },
-       { "m",          (u64) NSEC_PER_SEC * 60},
-       { "h",          (u64) NSEC_PER_SEC * 3600},
-       { "d",          (u64) NSEC_PER_SEC * 3600 * 24},
-       { "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
-       { "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
-       { "eon",        U64_MAX          },
-};
-
-const struct time_unit *bch2_pick_time_units(u64 ns)
-{
-       const struct time_unit *u;
-
-       for (u = time_units;
-            u + 1 < time_units + ARRAY_SIZE(time_units) &&
-            ns >= u[1].nsecs << 1;
-            u++)
-               ;
-
-       return u;
-}
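
The loop advances to a larger unit only once the value is at least twice that
unit, so e.g. 1999 ns still prints as nanoseconds while 2000 ns becomes 2 us.
A standalone sketch with a trimmed unit table:

#include <stdint.h>
#include <stdio.h>

struct time_unit {
        const char      *name;
        uint64_t        nsecs;
};

static const struct time_unit time_units[] = {
        { "ns", 1 },
        { "us", 1000ULL },
        { "ms", 1000000ULL },
        { "s",  1000000000ULL },
};

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

static const struct time_unit *pick_time_units(uint64_t ns)
{
        const struct time_unit *u;

        /* advance while the next-larger unit fits at least twice into ns */
        for (u = time_units;
             u + 1 < time_units + ARRAY_SIZE(time_units) &&
             ns >= (u[1].nsecs << 1);
             u++)
                ;
        return u;
}

int main(void)
{
        uint64_t samples[] = { 999, 1999, 2000, 3500000, 7200000000ULL };

        for (size_t i = 0; i < ARRAY_SIZE(samples); i++) {
                const struct time_unit *u = pick_time_units(samples[i]);

                printf("%llu ns -> %llu %s\n",
                       (unsigned long long) samples[i],
                       (unsigned long long) (samples[i] / u->nsecs), u->name);
        }
        return 0;
}
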
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
-       unsigned i = 0;
-
-       while (i < ARRAY_SIZE(q->entries)) {
-               struct quantile_entry *e = q->entries + i;
-
-               if (unlikely(!e->step)) {
-                       e->m = v;
-                       e->step = max_t(unsigned, v / 2, 1024);
-               } else if (e->m > v) {
-                       e->m = e->m >= e->step
-                               ? e->m - e->step
-                               : 0;
-               } else if (e->m < v) {
-                       e->m = e->m + e->step > e->m
-                               ? e->m + e->step
-                               : U32_MAX;
-               }
-
-               if ((e->m > v ? e->m - v : v - e->m) < e->step)
-                       e->step = max_t(unsigned, e->step / 2, 1);
-
-               if (v >= e->m)
-                       break;
-
-               i = eytzinger0_child(i, v > e->m);
-       }
-}
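
Each quantile entry keeps a running estimate m that is nudged toward every
sample by step, with step shrinking as the estimate converges. Entries sit in
an eytzinger (implicit BST) layout, and the walk descends (always to the left
child, as above) only while the sample is still below the updated estimate,
so deeper entries settle on progressively lower quantiles while the root
settles near the middle of the stream. A standalone sketch, assuming the
usual 0-based eytzinger child rule (children of i at 2i+1 and 2i+2; the real
eytzinger0_child() lives in eytzinger.h and is not shown here), with the
entry count picked arbitrarily:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_QUANTILES 15

struct quantile_entry {
        uint64_t m;     /* running estimate */
        uint64_t step;  /* how far to nudge it per update */
};

/* 0-based eytzinger layout: children of i are 2i+1 (left) and 2i+2 (right) */
static unsigned eytzinger0_child(unsigned i, unsigned right)
{
        return (i << 1) + 1 + right;
}

static void quantiles_update(struct quantile_entry *q, uint64_t v)
{
        unsigned i = 0;

        while (i < NR_QUANTILES) {
                struct quantile_entry *e = q + i;

                if (!e->step) {                 /* first sample seeds it */
                        e->m = v;
                        e->step = v / 2 > 1024 ? v / 2 : 1024;
                } else if (e->m > v) {
                        e->m = e->m >= e->step ? e->m - e->step : 0;
                } else if (e->m < v) {
                        /* overflow clamp kept as in the original */
                        e->m = e->m + e->step > e->m ? e->m + e->step : UINT32_MAX;
                }

                /* estimate converging: take smaller steps */
                if ((e->m > v ? e->m - v : v - e->m) < e->step)
                        e->step = e->step / 2 > 1 ? e->step / 2 : 1;

                if (v >= e->m)
                        break;

                i = eytzinger0_child(i, v > e->m);
        }
}

int main(void)
{
        struct quantile_entry q[NR_QUANTILES] = { 0 };

        srand(42);
        for (int i = 0; i < 100000; i++)
                quantiles_update(q, 1000 + rand() % 1000);

        /* the root's estimate settles where up- and down-nudges balance */
        printf("median-ish estimate: %llu\n", (unsigned long long) q[0].m);
        return 0;
}
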
-
-static inline void time_stats_update_one(struct bch2_time_stats *stats,
-                                             u64 start, u64 end)
-{
-       u64 duration, freq;
-       bool initted = stats->last_event != 0;
-
-       if (time_after64(end, start)) {
-               struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
-               duration = end - start;
-               mean_and_variance_update(&stats->duration_stats, duration);
-               mean_and_variance_weighted_update(&stats->duration_stats_weighted,
-                               duration, initted, TIME_STATS_MV_WEIGHT);
-               stats->max_duration = max(stats->max_duration, duration);
-               stats->min_duration = min(stats->min_duration, duration);
-               stats->total_duration += duration;
-
-               if (quantiles)
-                       quantiles_update(quantiles, duration);
-       }
-
-       if (stats->last_event && time_after64(end, stats->last_event)) {
-               freq = end - stats->last_event;
-               mean_and_variance_update(&stats->freq_stats, freq);
-               mean_and_variance_weighted_update(&stats->freq_stats_weighted,
-                               freq, initted, TIME_STATS_MV_WEIGHT);
-               stats->max_freq = max(stats->max_freq, freq);
-               stats->min_freq = min(stats->min_freq, freq);
-       }
-
-       stats->last_event = end;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-                                   struct time_stat_buffer *b)
-{
-       for (struct time_stat_buffer_entry *i = b->entries;
-            i < b->entries + ARRAY_SIZE(b->entries);
-            i++)
-               time_stats_update_one(stats, i->start, i->end);
-       b->nr = 0;
-}
-
-static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
-                                            struct time_stat_buffer *b)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&stats->lock, flags);
-       __bch2_time_stats_clear_buffer(stats, b);
-       spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
-       unsigned long flags;
-
-       if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
-               spin_lock_irqsave(&stats->lock, flags);
-               time_stats_update_one(stats, start, end);
-
-               if (!stats->buffer &&
-                   mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
-                   stats->duration_stats.n > 1024)
-                       stats->buffer =
-                               alloc_percpu_gfp(struct time_stat_buffer,
-                                                GFP_ATOMIC);
-               spin_unlock_irqrestore(&stats->lock, flags);
-       } else {
-               struct time_stat_buffer *b;
-
-               preempt_disable();
-               b = this_cpu_ptr(stats->buffer);
-
-               BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-               b->entries[b->nr++] = (struct time_stat_buffer_entry) {
-                       .start = start,
-                       .end = end
-               };
-
-               if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-                       time_stats_clear_buffer(stats, b);
-               preempt_enable();
-       }
-}
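So collection has two paths: a spinlocked slow path, and a percpu fast path
that the slow path switches on once a stats instance proves hot (per the check
above: more than 1024 recorded durations and a weighted mean inter-event gap
below 32 ns). The fast path appends (start, end) pairs to a 31-entry percpu
buffer and only takes the lock to flush a full buffer. Callers bracket the
timed work the same way on either path; a minimal sketch (do_work() is a
stand-in):

    u64 start = local_clock();

    do_work();
    __bch2_time_stats_update(&stats, start, local_clock());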
-
-void bch2_time_stats_reset(struct bch2_time_stats *stats)
-{
-       spin_lock_irq(&stats->lock);
-       unsigned offset = offsetof(struct bch2_time_stats, min_duration);
-       memset((void *) stats + offset, 0, sizeof(*stats) - offset);
-
-       if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
-               int cpu;
-               for_each_possible_cpu(cpu)
-                       per_cpu_ptr(stats->buffer, cpu)->nr = 0;
-       }
-       spin_unlock_irq(&stats->lock);
-}
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
-       if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
-               free_percpu(stats->buffer);
-       stats->buffer = NULL;
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
-       memset(stats, 0, sizeof(*stats));
-       stats->min_duration = U64_MAX;
-       stats->min_freq = U64_MAX;
-       spin_lock_init(&stats->lock);
-}
-
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
-{
-       bch2_time_stats_init(stats);
-       stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
-}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
deleted file mode 100644 (file)
index eddb098..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * bch2_time_stats - collect statistics on events that have a duration, with nicely
- * formatted textual output on demand
- *
- * - percpu buffering of event collection: cheap enough to shotgun
- *   everywhere without worrying about overhead
- *
- * tracks:
- *  - number of events
- *  - maximum event duration ever seen
- *  - sum of all event durations
- *  - average event duration, standard and weighted
- *  - standard deviation of event durations, standard and weighted
- * and analogous statistics for the frequency of events
- *
- * We provide both mean and weighted mean (exponentially weighted), and standard
- * deviation and weighted standard deviation, to give an efficient-to-compute
- * view of current behaviour versus average behaviour - "did this event source
- * just become wonky, or is this typical?".
- *
- * Particularly useful for tracking down latency issues.
- */
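A typical lifecycle, sketched with the API declared below (the timed region is
hypothetical):

    struct bch2_time_stats st;

    bch2_time_stats_init(&st);

    u64 start = local_clock();
    do_timed_work();                        /* hypothetical */
    bch2_time_stats_update(&st, start);     /* end time taken internally */

    bch2_time_stats_exit(&st);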
-#ifndef _BCACHEFS_TIME_STATS_H
-#define _BCACHEFS_TIME_STATS_H
-
-#include <linux/sched/clock.h>
-#include <linux/spinlock_types.h>
-#include <linux/string.h>
-
-#include "mean_and_variance.h"
-
-struct time_unit {
-       const char      *name;
-       u64             nsecs;
-};
-
-/*
- * given a nanosecond value, pick the preferred time units for printing:
- */
-const struct time_unit *bch2_pick_time_units(u64 ns);
-
-/*
- * quantiles - do not use:
- *
- * Only enabled if bch2_time_stats->have_quantiles has been set (see
- * bch2_time_stats_quantiles_init() below) - don't use in new code.
- */
-
-#define NR_QUANTILES   15
-#define QUANTILE_IDX(i)        inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST  eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
-       struct quantile_entry {
-               u64     m;
-               u64     step;
-       }               entries[NR_QUANTILES];
-};
-
-struct time_stat_buffer {
-       unsigned        nr;
-       struct time_stat_buffer_entry {
-               u64     start;
-               u64     end;
-       }               entries[31];
-};
-
-struct bch2_time_stats {
-       spinlock_t      lock;
-       bool            have_quantiles;
-       struct time_stat_buffer __percpu *buffer;
-       /* all fields are in nanoseconds */
-       u64             min_duration;
-       u64             max_duration;
-       u64             total_duration;
-       u64             max_freq;
-       u64             min_freq;
-       u64             last_event;
-       u64             last_event_start;
-
-       struct mean_and_variance          duration_stats;
-       struct mean_and_variance          freq_stats;
-
-/* default weight for weighted mean and variance calculations */
-#define TIME_STATS_MV_WEIGHT   8
-
-       struct mean_and_variance_weighted duration_stats_weighted;
-       struct mean_and_variance_weighted freq_stats_weighted;
-};
-
-struct bch2_time_stats_quantiles {
-       struct bch2_time_stats  stats;
-       struct quantiles        quantiles;
-};
-
-static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
-{
-       return stats->have_quantiles
-               ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
-               : NULL;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-/**
- * bch2_time_stats_update - collect a new event being tracked
- *
- * @stats:     bch2_time_stats to update
- * @start:     start time of the event, recorded with local_clock()
- *
- * The end time of the event is the current time, read via local_clock()
- */
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
-       __bch2_time_stats_update(stats, start, local_clock());
-}
-
-/**
- * track_event_change - track state change events
- *
- * @stats:     bch2_time_stats to update
- * @v:         new state, true or false
- *
- * Use this when tracking time stats for state changes, e.g. resource X becoming
- * blocked/unblocked. Returns true when a new interval starts, i.e. when @v
- * transitions from false to true.
- */
-static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
-{
-       if (v != !!stats->last_event_start) {
-               if (!v) {
-                       bch2_time_stats_update(stats, stats->last_event_start);
-                       stats->last_event_start = 0;
-               } else {
-                       stats->last_event_start = local_clock() ?: 1;
-                       return true;
-               }
-       }
-
-       return false;
-}
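A caller tracking, say, how long a resource stays blocked flips the same stats
object on the two transitions; a hedged sketch (blocked_stats is hypothetical):

    track_event_change(&blocked_stats, true);   /* became blocked */
    /* ... */
    track_event_change(&blocked_stats, false);  /* unblocked: the duration since
                                                   the 'true' call is recorded */

Calls that repeat the current state are no-ops, so redundant call sites are
safe.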
-
-void bch2_time_stats_reset(struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
-
-static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
-{
-       bch2_time_stats_exit(&statq->stats);
-}
-static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
-{
-       bch2_time_stats_init(&statq->stats);
-       statq->stats.have_quantiles = true;
-       memset(&statq->quantiles, 0, sizeof(statq->quantiles));
-}
-
-#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
deleted file mode 100644 (file)
index dfad1d0..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update_interior.h"
-#include "keylist.h"
-#include "move_types.h"
-#include "opts.h"
-#include "six.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
deleted file mode 100644 (file)
index 9c5a9c5..0000000
+++ /dev/null
@@ -1,1883 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-
-#include <linux/tracepoint.h>
-
-#define TRACE_BPOS_entries(name)                               \
-       __field(u64,                    name##_inode    )       \
-       __field(u64,                    name##_offset   )       \
-       __field(u32,                    name##_snapshot )
-
-#define TRACE_BPOS_assign(dst, src)                            \
-       __entry->dst##_inode            = (src).inode;          \
-       __entry->dst##_offset           = (src).offset;         \
-       __entry->dst##_snapshot         = (src).snapshot
-
-DECLARE_EVENT_CLASS(bpos,
-       TP_PROTO(const struct bpos *p),
-       TP_ARGS(p),
-
-       TP_STRUCT__entry(
-               TRACE_BPOS_entries(p)
-       ),
-
-       TP_fast_assign(
-               TRACE_BPOS_assign(p, *p);
-       ),
-
-       TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-);
-
-DECLARE_EVENT_CLASS(fs_str,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __string(str,           str                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __assign_str(str);
-       ),
-
-       TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
-);
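DECLARE_EVENT_CLASS() only defines the record layout and format; each
DEFINE_EVENT() naming the class (many follow below) stamps out a tracepoint
fired as trace_<name>(). For an fs_str event the caller typically renders the
string into a printbuf first; a sketch against journal_entry_close, defined
further down (format_journal_entry() is a hypothetical helper; PRINTBUF,
printbuf_exit and the trace_*_enabled() guard are standard kernel/bcachefs
API):

    if (trace_journal_entry_close_enabled()) {
            struct printbuf buf = PRINTBUF;

            format_journal_entry(&buf);             /* hypothetical */
            trace_journal_entry_close(c, buf.buf);
            printbuf_exit(&buf);
    }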
-
-DECLARE_EVENT_CLASS(trans_str,
-       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
-       TP_ARGS(trans, caller_ip, str),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __array(char,           trans_fn, 32            )
-               __field(unsigned long,  caller_ip               )
-               __string(str,           str                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = trans->c->dev;
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __assign_str(str);
-       ),
-
-       TP_printk("%d,%d %s %pS %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str_nocaller,
-       TP_PROTO(struct btree_trans *trans, const char *str),
-       TP_ARGS(trans, str),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __array(char,           trans_fn, 32            )
-               __string(str,           str                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = trans->c->dev;
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __assign_str(str);
-       ),
-
-       TP_printk("%d,%d %s %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->trans_fn, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(btree_node_nofs,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u8,             level                   )
-               __field(u8,             btree_id                )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->level          = b->c.level;
-               __entry->btree_id       = b->c.btree_id;
-               TRACE_BPOS_assign(pos, b->key.k.p);
-       ),
-
-       TP_printk("%d,%d %u %s %llu:%llu:%u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->level,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_node,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __array(char,           trans_fn, 32            )
-               __field(u8,             level                   )
-               __field(u8,             btree_id                )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = trans->c->dev;
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->level          = b->c.level;
-               __entry->btree_id       = b->c.btree_id;
-               TRACE_BPOS_assign(pos, b->key.k.p);
-       ),
-
-       TP_printk("%d,%d %s %u %s %llu:%llu:%u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
-                 __entry->level,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-       ),
-
-       TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
-DECLARE_EVENT_CLASS(btree_trans,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __array(char,           trans_fn, 32            )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = trans->c->dev;
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-       ),
-
-       TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
-);
-
-DECLARE_EVENT_CLASS(bio,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(sector_t,       sector                  )
-               __field(unsigned int,   nr_sector               )
-               __array(char,           rwbs,   6               )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = bio->bi_bdev ? bio_dev(bio) : 0;
-               __entry->sector         = bio->bi_iter.bi_sector;
-               __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
-               blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
-       ),
-
-       TP_printk("%d,%d  %s %llu + %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-                 (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* errors */
-
-TRACE_EVENT(error_throw,
-       TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
-       TP_ARGS(c, bch_err, ip),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(int,            err                     )
-               __array(char,           err_str, 32             )
-               __array(char,           ip, 32                  )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->err            = bch_err;
-               strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
-               snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
-       ),
-
-       TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ip, __entry->err_str)
-);
-
-TRACE_EVENT(error_downcast,
-       TP_PROTO(int bch_err, int std_err, unsigned long ip),
-       TP_ARGS(bch_err, std_err, ip),
-
-       TP_STRUCT__entry(
-               __array(char,           bch_err, 32             )
-               __array(char,           std_err, 32             )
-               __array(char,           ip, 32                  )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
-               strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
-               snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
-       ),
-
-       TP_printk("%s ret %s -> %s %s", __entry->ip,
-                 __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
-/* disk_accounting.c */
-
-TRACE_EVENT(accounting_mem_insert,
-       TP_PROTO(struct bch_fs *c, const char *acc),
-       TP_ARGS(c, acc),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(unsigned,       new_nr                  )
-               __string(acc,           acc                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->new_nr         = c->accounting.k.nr;
-               __assign_str(acc);
-       ),
-
-       TP_printk("%d,%d entries %u added %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->new_nr,
-                 __get_str(acc))
-);
-
-/* fs.c: */
-TRACE_EVENT(bch2_sync_fs,
-       TP_PROTO(struct super_block *sb, int wait),
-
-       TP_ARGS(sb, wait),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        int,    wait                    )
-
-       ),
-
-       TP_fast_assign(
-               __entry->dev    = sb->s_dev;
-               __entry->wait   = wait;
-       ),
-
-       TP_printk("dev %d,%d wait %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->wait)
-);
-
-/* fs-io.c: */
-TRACE_EVENT(bch2_fsync,
-       TP_PROTO(struct file *file, int datasync),
-
-       TP_ARGS(file, datasync),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        ino_t,  parent                  )
-               __field(        int,    datasync                )
-       ),
-
-       TP_fast_assign(
-               struct dentry *dentry = file->f_path.dentry;
-
-               __entry->dev            = dentry->d_sb->s_dev;
-               __entry->ino            = d_inode(dentry)->i_ino;
-               __entry->parent         = d_inode(dentry->d_parent)->i_ino;
-               __entry->datasync       = datasync;
-       ),
-
-       TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino,
-                 (unsigned long) __entry->parent, __entry->datasync)
-);
-
-/* super-io.c: */
-TRACE_EVENT(write_super,
-       TP_PROTO(struct bch_fs *c, unsigned long ip),
-       TP_ARGS(c, ip),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev     )
-               __field(unsigned long,  ip      )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->ip             = ip;
-       ),
-
-       TP_printk("%d,%d for %pS",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (void *) __entry->ip)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, io_read_promote,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-TRACE_EVENT(io_read_nopromote,
-       TP_PROTO(struct bch_fs *c, int ret),
-       TP_ARGS(c, ret),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev             )
-               __array(char,           ret, 32         )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
-       ),
-
-       TP_printk("%d,%d ret %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ret)
-);
-
-DEFINE_EVENT(bio, io_read_bounce,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_split,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_retry,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_reuse_race,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_fail_and_poison,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-/* ec.c */
-
-TRACE_EVENT(stripe_create,
-       TP_PROTO(struct bch_fs *c, u64 idx, int ret),
-       TP_ARGS(c, idx, ret),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            idx                     )
-               __field(int,            ret                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->idx                    = idx;
-               __entry->ret                    = ret;
-       ),
-
-       TP_printk("%d,%d idx %llu ret %i",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->idx,
-                 __entry->ret)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_full,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_close,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(bio, journal_write,
-       TP_PROTO(struct bio *bio),
-       TP_ARGS(bio)
-);
-
-TRACE_EVENT(journal_reclaim_start,
-       TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
-                u64 min_nr, u64 min_key_cache,
-                u64 btree_cache_dirty, u64 btree_cache_total,
-                u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
-               btree_cache_dirty, btree_cache_total,
-               btree_key_cache_dirty, btree_key_cache_total),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(bool,           direct                  )
-               __field(bool,           kicked                  )
-               __field(u64,            min_nr                  )
-               __field(u64,            min_key_cache           )
-               __field(u64,            btree_cache_dirty       )
-               __field(u64,            btree_cache_total       )
-               __field(u64,            btree_key_cache_dirty   )
-               __field(u64,            btree_key_cache_total   )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->direct                 = direct;
-               __entry->kicked                 = kicked;
-               __entry->min_nr                 = min_nr;
-               __entry->min_key_cache          = min_key_cache;
-               __entry->btree_cache_dirty      = btree_cache_dirty;
-               __entry->btree_cache_total      = btree_cache_total;
-               __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
-               __entry->btree_key_cache_total  = btree_key_cache_total;
-       ),
-
-       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->direct,
-                 __entry->kicked,
-                 __entry->min_nr,
-                 __entry->min_key_cache,
-                 __entry->btree_cache_dirty,
-                 __entry->btree_cache_total,
-                 __entry->btree_key_cache_dirty,
-                 __entry->btree_key_cache_total)
-);
-
-TRACE_EVENT(journal_reclaim_finish,
-       TP_PROTO(struct bch_fs *c, u64 nr_flushed),
-       TP_ARGS(c, nr_flushed),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            nr_flushed              )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->nr_flushed     = nr_flushed;
-       ),
-
-       TP_printk("%d,%d flushed %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->nr_flushed)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
-       TP_PROTO(const struct bpos *p),
-       TP_ARGS(p)
-);
-
-/* Btree cache: */
-
-TRACE_EVENT(btree_cache_scan,
-       TP_PROTO(long nr_to_scan, long can_free, long ret),
-       TP_ARGS(nr_to_scan, can_free, ret),
-
-       TP_STRUCT__entry(
-               __field(long,   nr_to_scan              )
-               __field(long,   can_free                )
-               __field(long,   ret                     )
-       ),
-
-       TP_fast_assign(
-               __entry->nr_to_scan     = nr_to_scan;
-               __entry->can_free       = can_free;
-               __entry->ret            = ret;
-       ),
-
-       TP_printk("scanned for %li nodes, can free %li, ret %li",
-                 __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans)
-);
-
-/* Btree */
-
-DEFINE_EVENT(btree_node, btree_node_read,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_node_write,
-       TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
-       TP_ARGS(b, bytes, sectors),
-
-       TP_STRUCT__entry(
-               __field(enum btree_node_type,   type)
-               __field(unsigned,       bytes                   )
-               __field(unsigned,       sectors                 )
-       ),
-
-       TP_fast_assign(
-               __entry->type   = btree_node_type(b);
-               __entry->bytes  = bytes;
-               __entry->sectors = sectors;
-       ),
-
-       TP_printk("bkey type %u bytes %u sectors %u",
-                 __entry->type, __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
-       TP_PROTO(const char *trans_fn,
-                unsigned long caller_ip,
-                size_t required,
-                int ret),
-       TP_ARGS(trans_fn, caller_ip, required, ret),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(size_t,                 required        )
-               __array(char,                   ret, 32         )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip      = caller_ip;
-               __entry->required       = required;
-               strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
-       ),
-
-       TP_printk("%s %pS required %zu ret %s",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->required,
-                 __entry->ret)
-);
-
-DEFINE_EVENT(btree_node, btree_node_compact,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_merge,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_split,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_rewrite,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_set_root,
-       TP_PROTO(struct btree_trans *trans, struct btree *b),
-       TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_path_relock_fail,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path,
-                unsigned level),
-       TP_ARGS(trans, caller_ip, path, level),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     btree_id        )
-               __field(u8,                     level           )
-               __field(u8,                     path_idx        )
-               TRACE_BPOS_entries(pos)
-               __array(char,                   node, 24        )
-               __field(u8,                     self_read_count )
-               __field(u8,                     self_intent_count)
-               __field(u8,                     read_count      )
-               __field(u8,                     intent_count    )
-               __field(u32,                    iter_lock_seq   )
-               __field(u32,                    node_lock_seq   )
-       ),
-
-       TP_fast_assign(
-               struct btree *b = btree_path_node(path, level);
-               struct six_lock_count c;
-
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = path->btree_id;
-               __entry->level                  = level;
-               __entry->path_idx               = path - trans->paths;
-               TRACE_BPOS_assign(pos, path->pos);
-
-               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
-               __entry->self_read_count        = c.n[SIX_LOCK_read];
-               __entry->self_intent_count      = c.n[SIX_LOCK_intent];
-
-               if (IS_ERR(b)) {
-                       strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
-               } else {
-                       c = six_lock_counts(&path->l[level].b->c.lock);
-                       __entry->read_count     = c.n[SIX_LOCK_read];
-                       __entry->intent_count   = c.n[SIX_LOCK_intent];
-                       scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
-               }
-               __entry->iter_lock_seq          = path->l[level].lock_seq;
-               __entry->node_lock_seq          = is_btree_node(path, level)
-                       ? six_lock_seq(&path->l[level].b->c.lock)
-                       : 0;
-       ),
-
-       TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->path_idx,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->level,
-                 __entry->node,
-                 __entry->self_read_count,
-                 __entry->self_intent_count,
-                 __entry->read_count,
-                 __entry->intent_count,
-                 __entry->iter_lock_seq,
-                 __entry->node_lock_seq)
-);
-
-TRACE_EVENT(btree_path_upgrade_fail,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path,
-                unsigned level),
-       TP_ARGS(trans, caller_ip, path, level),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     btree_id        )
-               __field(u8,                     level           )
-               __field(u8,                     path_idx        )
-               TRACE_BPOS_entries(pos)
-               __field(u8,                     locked          )
-               __field(u8,                     self_read_count )
-               __field(u8,                     self_intent_count)
-               __field(u8,                     read_count      )
-               __field(u8,                     intent_count    )
-               __field(u32,                    iter_lock_seq   )
-               __field(u32,                    node_lock_seq   )
-       ),
-
-       TP_fast_assign(
-               struct six_lock_count c;
-
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = path->btree_id;
-               __entry->level                  = level;
-               __entry->path_idx               = path - trans->paths;
-               TRACE_BPOS_assign(pos, path->pos);
-               __entry->locked                 = btree_node_locked(path, level);
-
-               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
-               __entry->self_read_count        = c.n[SIX_LOCK_read];
-               __entry->self_intent_count      = c.n[SIX_LOCK_intent];
-               c = six_lock_counts(&path->l[level].b->c.lock);
-               __entry->read_count             = c.n[SIX_LOCK_read];
-               __entry->intent_count           = c.n[SIX_LOCK_intent];
-               __entry->iter_lock_seq          = path->l[level].lock_seq;
-               __entry->node_lock_seq          = is_btree_node(path, level)
-                       ? six_lock_seq(&path->l[level].b->c.lock)
-                       : 0;
-       ),
-
-       TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->path_idx,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->level,
-                 __entry->locked,
-                 __entry->self_read_count,
-                 __entry->self_intent_count,
-                 __entry->read_count,
-                 __entry->intent_count,
-                 __entry->iter_lock_seq,
-                 __entry->node_lock_seq)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(bch_fs, gc_gens_start,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_gens_end,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-/* Allocator */
-
-DEFINE_EVENT(fs_str, bucket_alloc,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, bucket_alloc_fail,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DECLARE_EVENT_CLASS(discard_buckets_class,
-       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-                u64 need_journal_commit, u64 discarded, const char *err),
-       TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            seen                    )
-               __field(u64,            open                    )
-               __field(u64,            need_journal_commit     )
-               __field(u64,            discarded               )
-               __array(char,           err,    16              )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->seen                   = seen;
-               __entry->open                   = open;
-               __entry->need_journal_commit    = need_journal_commit;
-               __entry->discarded              = discarded;
-               strscpy(__entry->err, err, sizeof(__entry->err));
-       ),
-
-       TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->seen,
-                 __entry->open,
-                 __entry->need_journal_commit,
-                 __entry->discarded,
-                 __entry->err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets,
-       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-                u64 need_journal_commit, u64 discarded, const char *err),
-       TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
-       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-                u64 need_journal_commit, u64 discarded, const char *err),
-       TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-TRACE_EVENT(bucket_invalidate,
-       TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
-       TP_ARGS(c, dev, bucket, sectors),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u32,            dev_idx                 )
-               __field(u32,            sectors                 )
-               __field(u64,            bucket                  )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->dev_idx        = dev;
-               __entry->sectors        = sectors;
-               __entry->bucket         = bucket;
-       ),
-
-       TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->dev_idx, __entry->bucket,
-                 __entry->sectors)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(fs_str, io_move,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_read,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_finish,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_fail,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write_fail,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_start_fail,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-TRACE_EVENT(move_data,
-       TP_PROTO(struct bch_fs *c,
-                struct bch_move_stats *stats),
-       TP_ARGS(c, stats),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev             )
-               __field(u64,            keys_moved      )
-               __field(u64,            keys_raced      )
-               __field(u64,            sectors_seen    )
-               __field(u64,            sectors_moved   )
-               __field(u64,            sectors_raced   )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->keys_moved     = atomic64_read(&stats->keys_moved);
-               __entry->keys_raced     = atomic64_read(&stats->keys_raced);
-               __entry->sectors_seen   = atomic64_read(&stats->sectors_seen);
-               __entry->sectors_moved  = atomic64_read(&stats->sectors_moved);
-               __entry->sectors_raced  = atomic64_read(&stats->sectors_raced);
-       ),
-
-       TP_printk("%d,%d keys moved %llu raced %llu"
-                 "sectors seen %llu moved %llu raced %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->keys_moved,
-                 __entry->keys_raced,
-                 __entry->sectors_seen,
-                 __entry->sectors_moved,
-                 __entry->sectors_raced)
-);
-
-TRACE_EVENT(copygc,
-       TP_PROTO(struct bch_fs *c,
-                u64 buckets,
-                u64 sectors_seen,
-                u64 sectors_moved),
-       TP_ARGS(c, buckets, sectors_seen, sectors_moved),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            buckets                 )
-               __field(u64,            sectors_seen            )
-               __field(u64,            sectors_moved           )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->buckets                = buckets;
-               __entry->sectors_seen           = sectors_seen;
-               __entry->sectors_moved          = sectors_moved;
-       ),
-
-       TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->buckets,
-                 __entry->sectors_seen,
-                 __entry->sectors_moved)
-);
-
-TRACE_EVENT(copygc_wait,
-       TP_PROTO(struct bch_fs *c,
-                u64 wait_amount, u64 until),
-       TP_ARGS(c, wait_amount, until),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            wait_amount             )
-               __field(u64,            until                   )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __entry->wait_amount    = wait_amount;
-               __entry->until          = until;
-       ),
-
-       TP_printk("%d,%u waiting for %llu sectors until %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->wait_amount, __entry->until)
-);
-
-/* btree transactions: */
-
-DECLARE_EVENT_CLASS(transaction_event,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-       ),
-
-       TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-DEFINE_EVENT(transaction_event,        transaction_commit,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event,        trans_restart_injected,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_split_race,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree *b),
-       TP_ARGS(trans, caller_ip, b),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     level           )
-               __field(u16,                    written         )
-               __field(u16,                    blocks          )
-               __field(u16,                    u64s_remaining  )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->level          = b->c.level;
-               __entry->written        = b->written;
-               __entry->blocks         = btree_blocks(trans->c);
-               __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
-       ),
-
-       TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
-                 __entry->trans_fn, (void *) __entry->caller_ip,
-                 __entry->level,
-                 __entry->written, __entry->blocks,
-                 __entry->u64s_remaining)
-);
-
-TRACE_EVENT(trans_blocked_journal_reclaim,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-
-               __field(unsigned long,          key_cache_nr_keys       )
-               __field(unsigned long,          key_cache_nr_dirty      )
-               __field(long,                   must_wait               )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->key_cache_nr_keys      = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
-               __entry->key_cache_nr_dirty     = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
-               __entry->must_wait              = __bch2_btree_key_cache_must_wait(trans->c);
-       ),
-
-       TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
-                 __entry->trans_fn, (void *) __entry->caller_ip,
-                 __entry->key_cache_nr_keys,
-                 __entry->key_cache_nr_dirty,
-                 __entry->must_wait)
-);
-
-#if 0
-/* todo: bring back dynamic fault injection */
-DEFINE_EVENT(transaction_event,        trans_restart_fault_inject,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-#endif
-
-DEFINE_EVENT(transaction_event,        trans_traverse_all,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event,        trans_restart_key_cache_raced,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                const char *paths),
-       TP_ARGS(trans, caller_ip, paths)
-);
-
-DECLARE_EVENT_CLASS(transaction_restart_iter,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(pos, path->pos)
-       ),
-
-       TP_printk("%s %pS btree %s pos %llu:%llu:%u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(fs_str, trans_restart_upgrade,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(trans_str,        trans_restart_relock,
-       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
-       TP_ARGS(trans, caller_ip, str)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
-       TP_PROTO(struct btree_trans *trans,
-                const char *cycle),
-       TP_ARGS(trans, cycle)
-);
-
-DEFINE_EVENT(transaction_event,        trans_restart_would_deadlock_recursion_limit,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_would_deadlock_write,
-       TP_PROTO(struct btree_trans *trans),
-       TP_ARGS(trans),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-       ),
-
-       TP_printk("%s", __entry->trans_fn)
-);
-
-TRACE_EVENT(trans_restart_mem_realloced,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                unsigned long bytes),
-       TP_ARGS(trans, caller_ip, bytes),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(unsigned long,          bytes           )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip      = caller_ip;
-               __entry->bytes          = bytes;
-       ),
-
-       TP_printk("%s %pS bytes %lu",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->bytes)
-);
-
-DEFINE_EVENT(transaction_event,        trans_restart_write_buffer_flush,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(path_downgrade,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path,
-                unsigned old_locks_want),
-       TP_ARGS(trans, caller_ip, path, old_locks_want),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(unsigned,               old_locks_want  )
-               __field(unsigned,               new_locks_want  )
-               __field(unsigned,               btree           )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->old_locks_want         = old_locks_want;
-               __entry->new_locks_want         = path->locks_want;
-               __entry->btree                  = path->btree_id;
-               TRACE_BPOS_assign(pos, path->pos);
-       ),
-
-       TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 __entry->old_locks_want,
-                 __entry->new_locks_want,
-                 bch2_btree_id_str(__entry->btree),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot)
-);
-
-TRACE_EVENT(key_cache_fill,
-       TP_PROTO(struct btree_trans *trans, const char *key),
-       TP_ARGS(trans, key),
-
-       TP_STRUCT__entry(
-               __array(char,           trans_fn, 32    )
-               __string(key,           key                     )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __assign_str(key);
-       ),
-
-       TP_printk("%s %s", __entry->trans_fn, __get_str(key))
-);
-
-TRACE_EVENT(write_buffer_flush,
-       TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
-       TP_ARGS(trans, nr, skipped, fast, size),
-
-       TP_STRUCT__entry(
-               __field(size_t,         nr              )
-               __field(size_t,         skipped         )
-               __field(size_t,         fast            )
-               __field(size_t,         size            )
-       ),
-
-       TP_fast_assign(
-               __entry->nr     = nr;
-               __entry->skipped = skipped;
-               __entry->fast   = fast;
-               __entry->size   = size;
-       ),
-
-       TP_printk("%zu/%zu skipped %zu fast %zu",
-                 __entry->nr, __entry->size, __entry->skipped, __entry->fast)
-);
-
-TRACE_EVENT(write_buffer_flush_sync,
-       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
-       TP_ARGS(trans, caller_ip),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-       ),
-
-       TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-TRACE_EVENT(write_buffer_flush_slowpath,
-       TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
-       TP_ARGS(trans, slowpath, total),
-
-       TP_STRUCT__entry(
-               __field(size_t,         slowpath        )
-               __field(size_t,         total           )
-       ),
-
-       TP_fast_assign(
-               __entry->slowpath       = slowpath;
-               __entry->total          = total;
-       ),
-
-       TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
-);
-
-TRACE_EVENT(write_buffer_maybe_flush,
-       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
-       TP_ARGS(trans, caller_ip, key),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __string(key,                   key             )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __assign_str(key);
-       ),
-
-       TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
-);
-
-DEFINE_EVENT(fs_str, rebalance_extent,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, data_update,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_pred,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, extent_trim_atomic,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_slot,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, __btree_iter_peek,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_max,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
-#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
-
-TRACE_EVENT(update_by_path,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path,
-                struct btree_insert_entry *i, bool overwrite),
-       TP_ARGS(trans, path, i, overwrite),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(btree_path_idx_t,       path_idx        )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(pos)
-               __field(u8,                     overwrite       )
-               __field(btree_path_idx_t,       update_idx      )
-               __field(btree_path_idx_t,       nr_updates      )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->path_idx               = path - trans->paths;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(pos, path->pos);
-               __entry->overwrite              = overwrite;
-               __entry->update_idx             = i - trans->updates;
-               __entry->nr_updates             = trans->nr_updates;
-       ),
-
-       TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
-                 __entry->trans_fn,
-                 __entry->path_idx,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->overwrite,
-                 __entry->update_idx,
-                 __entry->nr_updates)
-);
-
-TRACE_EVENT(btree_path_lock,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_bkey_cached_common *b),
-       TP_ARGS(trans, caller_ip, b),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     btree_id        )
-               __field(u8,                     level           )
-               __array(char,                   node, 24        )
-               __field(u32,                    lock_seq        )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = b->btree_id;
-               __entry->level                  = b->level;
-
-               scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
-               __entry->lock_seq               = six_lock_seq(&b->lock);
-       ),
-
-       TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
-                 __entry->trans_fn,
-                 (void *) __entry->caller_ip,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->level,
-                 __entry->node,
-                 __entry->lock_seq)
-);
-
-DECLARE_EVENT_CLASS(btree_path_ev,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path),
-
-       TP_STRUCT__entry(
-               __field(u16,                    idx             )
-               __field(u8,                     ref             )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path - trans->paths;
-               __entry->ref                    = path->ref;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(pos, path->pos);
-       ),
-
-       TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
-                 __entry->idx, __entry->ref,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_alloc,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path),
-
-       TP_STRUCT__entry(
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     locks_want      )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path - trans->paths;
-               __entry->locks_want             = path->locks_want;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(pos, path->pos);
-       ),
-
-       TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
-                 __entry->idx,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->locks_want,
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot)
-);
-
-TRACE_EVENT(btree_path_get,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
-       TP_ARGS(trans, path, new_pos),
-
-       TP_STRUCT__entry(
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     ref             )
-               __field(u8,                     preserve        )
-               __field(u8,                     locks_want      )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(old_pos)
-               TRACE_BPOS_entries(new_pos)
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path - trans->paths;
-               __entry->ref                    = path->ref;
-               __entry->preserve               = path->preserve;
-               __entry->locks_want             = path->locks_want;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(old_pos, path->pos);
-               TRACE_BPOS_assign(new_pos, *new_pos);
-       ),
-
-       TP_printk("    path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
-                 __entry->idx,
-                 __entry->ref,
-                 __entry->preserve,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->locks_want,
-                 __entry->old_pos_inode,
-                 __entry->old_pos_offset,
-                 __entry->old_pos_snapshot,
-                 __entry->new_pos_inode,
-                 __entry->new_pos_offset,
-                 __entry->new_pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_path_clone,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
-       TP_ARGS(trans, path, new),
-
-       TP_STRUCT__entry(
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     new_idx         )
-               __field(u8,                     btree_id        )
-               __field(u8,                     ref             )
-               __field(u8,                     preserve        )
-               TRACE_BPOS_entries(pos)
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path - trans->paths;
-               __entry->new_idx                = new - trans->paths;
-               __entry->btree_id               = path->btree_id;
-               __entry->ref                    = path->ref;
-               __entry->preserve               = path->preserve;
-               TRACE_BPOS_assign(pos, path->pos);
-       ),
-
-       TP_printk("  path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
-                 __entry->idx,
-                 __entry->ref,
-                 __entry->preserve,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->new_idx)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_clone,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
-       TP_ARGS(trans, path, new)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
-       TP_ARGS(trans, path, new)
-);
-
-DECLARE_EVENT_CLASS(btree_path_traverse,
-       TP_PROTO(struct btree_trans *trans,
-                struct btree_path *path),
-       TP_ARGS(trans, path),
-
-       TP_STRUCT__entry(
-               __array(char,                   trans_fn, 32    )
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     ref             )
-               __field(u8,                     preserve        )
-               __field(u8,                     should_be_locked )
-               __field(u8,                     btree_id        )
-               __field(u8,                     level           )
-               TRACE_BPOS_entries(pos)
-               __field(u8,                     locks_want      )
-               __field(u8,                     nodes_locked    )
-               __array(char,                   node0, 24       )
-               __array(char,                   node1, 24       )
-               __array(char,                   node2, 24       )
-               __array(char,                   node3, 24       )
-       ),
-
-       TP_fast_assign(
-               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-
-               __entry->idx                    = path - trans->paths;
-               __entry->ref                    = path->ref;
-               __entry->preserve               = path->preserve;
-               __entry->btree_id               = path->btree_id;
-               __entry->level                  = path->level;
-               TRACE_BPOS_assign(pos, path->pos);
-
-               __entry->locks_want             = path->locks_want;
-               __entry->nodes_locked           = path->nodes_locked;
-               struct btree *b = path->l[0].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
-               else
-                       scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
-               b = path->l[1].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
-               else
-                       scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
-               b = path->l[2].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
-               else
-                       scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
-               b = path->l[3].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
-               else
-                       scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
-       ),
-
-       TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
-                 "locks %u %u %u %u node %s %s %s %s",
-                 __entry->trans_fn,
-                 __entry->idx,
-                 __entry->ref,
-                 __entry->preserve,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->level,
-                 __entry->locks_want,
-                 (__entry->nodes_locked >> 6) & 3,
-                 (__entry->nodes_locked >> 4) & 3,
-                 (__entry->nodes_locked >> 2) & 3,
-                 (__entry->nodes_locked >> 0) & 3,
-                 __entry->node3,
-                 __entry->node2,
-                 __entry->node1,
-                 __entry->node0)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
-       TP_PROTO(struct btree_trans *trans,
-                struct btree_path *path),
-       TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
-       TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-       TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_set_pos,
-       TP_PROTO(struct btree_trans *trans,
-                struct btree_path *path,
-                struct bpos *new_pos),
-       TP_ARGS(trans, path, new_pos),
-
-       TP_STRUCT__entry(
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     ref             )
-               __field(u8,                     preserve        )
-               __field(u8,                     btree_id        )
-               TRACE_BPOS_entries(old_pos)
-               TRACE_BPOS_entries(new_pos)
-               __field(u8,                     locks_want      )
-               __field(u8,                     nodes_locked    )
-               __array(char,                   node0, 24       )
-               __array(char,                   node1, 24       )
-               __array(char,                   node2, 24       )
-               __array(char,                   node3, 24       )
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path - trans->paths;
-               __entry->ref                    = path->ref;
-               __entry->preserve               = path->preserve;
-               __entry->btree_id               = path->btree_id;
-               TRACE_BPOS_assign(old_pos, path->pos);
-               TRACE_BPOS_assign(new_pos, *new_pos);
-
-               __entry->nodes_locked           = path->nodes_locked;
-               struct btree *b = path->l[0].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
-               else
-                       scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
-               b = path->l[1].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
-               else
-                       scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
-               b = path->l[2].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
-               else
-                       scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
-               b = path->l[3].b;
-               if (IS_ERR(b))
-                       strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
-               else
-                       scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
-       ),
-
-       TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
-                 "locks %u %u %u %u node %s %s %s %s",
-                 __entry->idx,
-                 __entry->ref,
-                 __entry->preserve,
-                 bch2_btree_id_str(__entry->btree_id),
-                 __entry->old_pos_inode,
-                 __entry->old_pos_offset,
-                 __entry->old_pos_snapshot,
-                 __entry->new_pos_inode,
-                 __entry->new_pos_offset,
-                 __entry->new_pos_snapshot,
-                 (__entry->nodes_locked >> 6) & 3,
-                 (__entry->nodes_locked >> 4) & 3,
-                 (__entry->nodes_locked >> 2) & 3,
-                 (__entry->nodes_locked >> 0) & 3,
-                 __entry->node3,
-                 __entry->node2,
-                 __entry->node1,
-                 __entry->node0)
-);
-
-TRACE_EVENT(btree_path_free,
-       TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
-       TP_ARGS(trans, path, dup),
-
-       TP_STRUCT__entry(
-               __field(btree_path_idx_t,       idx             )
-               __field(u8,                     preserve        )
-               __field(u8,                     should_be_locked)
-               __field(s8,                     dup             )
-               __field(u8,                     dup_locked      )
-       ),
-
-       TP_fast_assign(
-               __entry->idx                    = path;
-               __entry->preserve               = trans->paths[path].preserve;
-               __entry->should_be_locked       = trans->paths[path].should_be_locked;
-               __entry->dup                    = dup ? dup - trans->paths  : -1;
-               __entry->dup_locked             = dup ? btree_node_locked(dup, dup->level) : 0;
-       ),
-
-       TP_printk("   path %3u %c %c dup %2i locked %u", __entry->idx,
-                 __entry->preserve ? 'P' : ' ',
-                 __entry->should_be_locked ? 'S' : ' ',
-                 __entry->dup,
-                 __entry->dup_locked)
-);
-
-#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-#ifndef _TRACE_BCACHEFS_H
-
-static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                                       struct btree_insert_entry *i, bool overwrite) {}
-static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
-static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-
-#endif
-#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-
-#define _TRACE_BCACHEFS_H
-#endif /* _TRACE_BCACHEFS_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../fs/bcachefs
-
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-#include <trace/define_trace.h>
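
Each TRACE_EVENT()/DEFINE_EVENT() above expands into a trace_<name>() inline that call sites invoke directly; it compiles down to a static-branch no-op until the event is enabled through tracefs. A minimal sketch of firing the write_buffer_flush event defined above (the surrounding function and counter values are hypothetical):

static void demo_flush_done(struct btree_trans *trans)
{
	size_t nr = 128, skipped = 3, fast = 120, size = 1024;	/* hypothetical counters */

	/* generated by TRACE_EVENT(write_buffer_flush, ...) */
	trace_write_buffer_flush(trans, nr, skipped, fast, size);
}
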
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
deleted file mode 100644 (file)
index 9764c2e..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "two_state_shared_lock.h"
-
-void __bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
-       __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
-}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
deleted file mode 100644 (file)
index 7f64784..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TWO_STATE_LOCK_H
-#define _BCACHEFS_TWO_STATE_LOCK_H
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-
-#include "util.h"
-
-/*
- * Two-state lock - can be taken for add or block - both states are shared,
- * like the read side of an rwsem, but each conflicts with the other state:
- */
-typedef struct {
-       atomic_long_t           v;
-       wait_queue_head_t       wait;
-} two_state_lock_t;
-
-static inline void two_state_lock_init(two_state_lock_t *lock)
-{
-       atomic_long_set(&lock->v, 0);
-       init_waitqueue_head(&lock->wait);
-}
-
-static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
-{
-       long i = s ? 1 : -1;
-
-       EBUG_ON(atomic_long_read(&lock->v) == 0);
-
-       if (atomic_long_sub_return_release(i, &lock->v) == 0)
-               wake_up_all(&lock->wait);
-}
-
-static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
-{
-       long i = s ? 1 : -1;
-       long old;
-
-       old = atomic_long_read(&lock->v);
-       do {
-               if (i > 0 ? old < 0 : old > 0)
-                       return false;
-       } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
-
-       return true;
-}
-
-void __bch2_two_state_lock(two_state_lock_t *, int);
-
-static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
-       if (!bch2_two_state_trylock(lock, s))
-               __bch2_two_state_lock(lock, s);
-}
-
-#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
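
A minimal usage sketch for the primitive above, assuming two_state_lock_init() has already run; the state values and the work in each section are illustrative. Holders of the same state share the lock, while holders of opposite states exclude each other:

static two_state_lock_t demo_lock;

static void demo_add_side(void)
{
	bch2_two_state_lock(&demo_lock, 0);	/* shared with other state-0 holders */
	/* ... 'add'-side work ... */
	bch2_two_state_unlock(&demo_lock, 0);
}

static void demo_block_side(void)
{
	bch2_two_state_lock(&demo_lock, 1);	/* waits until all state-0 holders drop out */
	/* ... 'block'-side work ... */
	bch2_two_state_unlock(&demo_lock, 1);
}
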
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
deleted file mode 100644 (file)
index df9a607..0000000
+++ /dev/null
@@ -1,1047 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * random utility code, for bcache but in theory not specific to bcache
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/log2.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/sched/clock.h>
-
-#include "eytzinger.h"
-#include "mean_and_variance.h"
-#include "util.h"
-
-static const char si_units[] = "?kMGTPEZY";
-
-/* string_get_size units: */
-static const char *const units_2[] = {
-       "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
-};
-static const char *const units_10[] = {
-       "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
-};
-
-static int parse_u64(const char *cp, u64 *res)
-{
-       const char *start = cp;
-       u64 v = 0;
-
-       if (!isdigit(*cp))
-               return -EINVAL;
-
-       do {
-               if (v > U64_MAX / 10)
-                       return -ERANGE;
-               v *= 10;
-               if (v > U64_MAX - (*cp - '0'))
-                       return -ERANGE;
-               v += *cp - '0';
-               cp++;
-       } while (isdigit(*cp));
-
-       *res = v;
-       return cp - start;
-}
-
-static int bch2_pow(u64 n, u64 p, u64 *res)
-{
-       *res = 1;
-
-       while (p--) {
-               if (*res > div64_u64(U64_MAX, n))
-                       return -ERANGE;
-               *res *= n;
-       }
-       return 0;
-}
-
-static int parse_unit_suffix(const char *cp, u64 *res)
-{
-       const char *start = cp;
-       u64 base = 1024;
-       unsigned u;
-       int ret;
-
-       if (*cp == ' ')
-               cp++;
-
-       for (u = 1; u < strlen(si_units); u++)
-               if (*cp == si_units[u]) {
-                       cp++;
-                       goto got_unit;
-               }
-
-       for (u = 0; u < ARRAY_SIZE(units_2); u++)
-               if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
-                       cp += strlen(units_2[u]);
-                       goto got_unit;
-               }
-
-       for (u = 0; u < ARRAY_SIZE(units_10); u++)
-               if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
-                       cp += strlen(units_10[u]);
-                       base = 1000;
-                       goto got_unit;
-               }
-
-       *res = 1;
-       return 0;
-got_unit:
-       ret = bch2_pow(base, u, res);
-       if (ret)
-               return ret;
-
-       return cp - start;
-}
-
-#define parse_or_ret(cp, _f)                   \
-do {                                           \
-       int _ret = _f;                          \
-       if (_ret < 0)                           \
-               return _ret;                    \
-       cp += _ret;                             \
-} while (0)
-
-static int __bch2_strtou64_h(const char *cp, u64 *res)
-{
-       const char *start = cp;
-       u64 v = 0, b, f_n = 0, f_d = 1;
-       int ret;
-
-       parse_or_ret(cp, parse_u64(cp, &v));
-
-       if (*cp == '.') {
-               cp++;
-               ret = parse_u64(cp, &f_n);
-               if (ret < 0)
-                       return ret;
-               cp += ret;
-
-               ret = bch2_pow(10, ret, &f_d);
-               if (ret)
-                       return ret;
-       }
-
-       parse_or_ret(cp, parse_unit_suffix(cp, &b));
-
-       if (v > div64_u64(U64_MAX, b))
-               return -ERANGE;
-       v *= b;
-
-       if (f_n > div64_u64(U64_MAX, b))
-               return -ERANGE;
-
-       f_n = div64_u64(f_n * b, f_d);
-       if (v + f_n < v)
-               return -ERANGE;
-       v += f_n;
-
-       *res = v;
-       return cp - start;
-}
-
-static int __bch2_strtoh(const char *cp, u64 *res,
-                        u64 t_max, bool t_signed)
-{
-       bool positive = *cp != '-';
-       u64 v = 0;
-
-       if (*cp == '+' || *cp == '-')
-               cp++;
-
-       parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
-
-       if (*cp == '\n')
-               cp++;
-       if (*cp)
-               return -EINVAL;
-
-       if (positive) {
-               if (v > t_max)
-                       return -ERANGE;
-       } else {
-               if (v && !t_signed)
-                       return -ERANGE;
-
-               if (v > t_max + 1)
-                       return -ERANGE;
-               v = -v;
-       }
-
-       *res = v;
-       return 0;
-}
-
-#define STRTO_H(name, type)                                    \
-int bch2_ ## name ## _h(const char *cp, type *res)             \
-{                                                              \
-       u64 v = 0;                                              \
-       int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),      \
-                       ANYSINT_MAX(type) != ((type) ~0ULL));   \
-       *res = v;                                               \
-       return ret;                                             \
-}
-
-STRTO_H(strtoint, int)
-STRTO_H(strtouint, unsigned int)
-STRTO_H(strtoll, long long)
-STRTO_H(strtoull, unsigned long long)
-STRTO_H(strtou64, u64)
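
A worked example of the parser above: a decimal fraction plus a bare SI letter resolves against base 1024 (only the units_10 strings select base 1000). The demo function is illustrative:

static void demo_parse(void)
{
	u64 v;
	int ret = bch2_strtou64_h("1.5G", &v);

	/* ret == 0, v == 1610612736: 1 * 2^30 + (5 * 2^30) / 10 */
}
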
-
-u64 bch2_read_flag_list(const char *opt, const char * const list[])
-{
-       u64 ret = 0;
-       char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
-
-       if (!d)
-               return -ENOMEM;
-
-       s = strim(d);
-
-       while ((p = strsep(&s, ",;"))) {
-               int flag = match_string(list, -1, p);
-
-               if (flag < 0) {
-                       ret = -1;
-                       break;
-               }
-
-               ret |= BIT_ULL(flag);
-       }
-
-       kfree(d);
-
-       return ret;
-}
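
A usage sketch with a hypothetical, NULL-terminated flag-name table; strsep() splits on both ',' and ';'. An unrecognized name makes the function return -1 widened to u64, so callers must treat all-ones as failure:

static const char * const demo_flag_names[] = {
	"journal", "btree", "data", NULL
};

static void demo_flags(void)
{
	u64 mask = bch2_read_flag_list("journal;data", demo_flag_names);

	/* mask == BIT_ULL(0) | BIT_ULL(2) */
}
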
-
-bool bch2_is_zero(const void *_p, size_t n)
-{
-       const char *p = _p;
-       size_t i;
-
-       for (i = 0; i < n; i++)
-               if (p[i])
-                       return false;
-       return true;
-}
-
-void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
-{
-       while (nr_bits)
-               prt_char(out, '0' + ((v >> --nr_bits) & 1));
-}
-
-void bch2_prt_u64_base2(struct printbuf *out, u64 v)
-{
-       bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
-}
-
-static bool string_is_spaces(const char *str)
-{
-       while (*str) {
-               if (*str != ' ')
-                       return false;
-               str++;
-       }
-       return true;
-}
-
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
-       bool locked = false;
-       const char *p;
-
-       if (!lines) {
-               printk("%s (null)\n", prefix);
-               return;
-       }
-
-       locked = console_trylock();
-
-       while (*lines) {
-               p = strchrnul(lines, '\n');
-               if (!*p && string_is_spaces(lines))
-                       break;
-
-               printk("%s%.*s\n", prefix, (int) (p - lines), lines);
-               if (!*p)
-                       break;
-               lines = p + 1;
-       }
-       if (locked)
-               console_unlock();
-}
-
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
-                       gfp_t gfp)
-{
-#ifdef CONFIG_STACKTRACE
-       unsigned nr_entries = 0;
-
-       stack->nr = 0;
-       int ret = darray_make_room_gfp(stack, 32, gfp);
-       if (ret)
-               return ret;
-
-       if (!down_read_trylock(&task->signal->exec_update_lock))
-               return -1;
-
-       do {
-               nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
-       } while (nr_entries == stack->size &&
-                !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
-
-       stack->nr = nr_entries;
-       up_read(&task->signal->exec_update_lock);
-
-       return ret;
-#else
-       return 0;
-#endif
-}
-
-void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
-{
-       darray_for_each(*stack, i) {
-               prt_printf(out, "[<0>] %pB", (void *) *i);
-               prt_newline(out);
-       }
-}
-
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
-{
-       bch_stacktrace stack = { 0 };
-       int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
-
-       bch2_prt_backtrace(out, &stack);
-       darray_exit(&stack);
-       return ret;
-}
-
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
-       time_t t = sec;
-       char buf[64];
-       ctime_r(&t, buf);
-       strim(buf);
-       prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
-       char buf[64];
-       snprintf(buf, sizeof(buf), "%ptT", &sec);
-       prt_str(out, buf);
-}
-#endif
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
-       const struct time_unit *u = bch2_pick_time_units(ns);
-
-       prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
-{
-       const struct time_unit *u = bch2_pick_time_units(ns);
-
-       prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
-{
-       prt_printf(out, "%s\t", name);
-       bch2_pr_time_units_aligned(out, ns);
-       prt_newline(out);
-}
-
-#define TABSTOP_SIZE 12
-
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
-{
-       struct quantiles *quantiles = time_stats_to_quantiles(stats);
-       s64 f_mean = 0, d_mean = 0;
-       u64 f_stddev = 0, d_stddev = 0;
-
-       if (stats->buffer) {
-               int cpu;
-
-               spin_lock_irq(&stats->lock);
-               for_each_possible_cpu(cpu)
-                       __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
-               spin_unlock_irq(&stats->lock);
-       }
-
-       /*
-        * avoid divide by zero
-        */
-       if (stats->freq_stats.n) {
-               f_mean = mean_and_variance_get_mean(stats->freq_stats);
-               f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
-               d_mean = mean_and_variance_get_mean(stats->duration_stats);
-               d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
-       }
-
-       printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
-       prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
-       printbuf_tabstop_pop(out);
-
-       printbuf_tabstops_reset(out);
-
-       printbuf_tabstop_push(out, out->indent + 20);
-       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
-       printbuf_tabstop_push(out, 0);
-       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
-
-       prt_printf(out, "\tsince mount\r\trecent\r\n");
-
-       printbuf_tabstops_reset(out);
-       printbuf_tabstop_push(out, out->indent + 20);
-       printbuf_tabstop_push(out, TABSTOP_SIZE);
-       printbuf_tabstop_push(out, 2);
-       printbuf_tabstop_push(out, TABSTOP_SIZE);
-
-       prt_printf(out, "duration of events\n");
-       printbuf_indent_add(out, 2);
-
-       pr_name_and_units(out, "min:", stats->min_duration);
-       pr_name_and_units(out, "max:", stats->max_duration);
-       pr_name_and_units(out, "total:", stats->total_duration);
-
-       prt_printf(out, "mean:\t");
-       bch2_pr_time_units_aligned(out, d_mean);
-       prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-       prt_newline(out);
-
-       prt_printf(out, "stddev:\t");
-       bch2_pr_time_units_aligned(out, d_stddev);
-       prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-
-       printbuf_indent_sub(out, 2);
-       prt_newline(out);
-
-       prt_printf(out, "time between events\n");
-       printbuf_indent_add(out, 2);
-
-       pr_name_and_units(out, "min:", stats->min_freq);
-       pr_name_and_units(out, "max:", stats->max_freq);
-
-       prt_printf(out, "mean:\t");
-       bch2_pr_time_units_aligned(out, f_mean);
-       prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-       prt_newline(out);
-
-       prt_printf(out, "stddev:\t");
-       bch2_pr_time_units_aligned(out, f_stddev);
-       prt_tab(out);
-       bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-
-       printbuf_indent_sub(out, 2);
-       prt_newline(out);
-
-       printbuf_tabstops_reset(out);
-
-       if (quantiles) {
-               int i = eytzinger0_first(NR_QUANTILES);
-               const struct time_unit *u =
-                       bch2_pick_time_units(quantiles->entries[i].m);
-               u64 last_q = 0;
-
-               prt_printf(out, "quantiles (%s):\t", u->name);
-               eytzinger0_for_each(j, NR_QUANTILES) {
-                       bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
-
-                       u64 q = max(quantiles->entries[j].m, last_q);
-                       prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
-                       if (is_last)
-                               prt_newline(out);
-                       last_q = q;
-               }
-       }
-}
-
-/* ratelimit: */
-
-/**
- * bch2_ratelimit_delay() - return how long to delay until the next time to do
- *             some work
- * @d:         the struct bch_ratelimit to update
- * Returns:    the amount of time to delay by, in jiffies
- */
-u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
-{
-       u64 now = local_clock();
-
-       return time_after64(d->next, now)
-               ? nsecs_to_jiffies(d->next - now)
-               : 0;
-}
-
-/**
- * bch2_ratelimit_increment() - increment @d by the amount of work done
- * @d:         the struct bch_ratelimit to update
- * @done:      the amount of work done, in arbitrary units
- */
-void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
-{
-       u64 now = local_clock();
-
-       d->next += div_u64(done * NSEC_PER_SEC, d->rate);
-
-       if (time_before64(now + NSEC_PER_SEC, d->next))
-               d->next = now + NSEC_PER_SEC;
-
-       if (time_after64(now - NSEC_PER_SEC * 2, d->next))
-               d->next = now - NSEC_PER_SEC * 2;
-}
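
A sketch of the intended calling pattern; have_work() and do_work() are hypothetical stand-ins. The caller sleeps for whatever bch2_ratelimit_delay() returns, does a batch, then credits the work done so d->next keeps advancing at d->rate units per second:

static void demo_rate_limited(struct bch_ratelimit *d)
{
	while (have_work()) {
		u64 delay = bch2_ratelimit_delay(d);

		if (delay)
			schedule_timeout_interruptible(delay);

		/* do_work() returns the number of units completed */
		bch2_ratelimit_increment(d, do_work());
	}
}
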
-
-/* pd controller: */
-
-/*
- * Updates pd_controller. Attempts to scale input values to units per second.
- * @target: desired value
- * @actual: current value
- *
- * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
- * it makes actual go down.
- */
-void bch2_pd_controller_update(struct bch_pd_controller *pd,
-                             s64 target, s64 actual, int sign)
-{
-       s64 proportional, derivative, change;
-
-       unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
-
-       if (seconds_since_update == 0)
-               return;
-
-       pd->last_update = jiffies;
-
-       proportional = actual - target;
-       proportional *= seconds_since_update;
-       proportional = div_s64(proportional, pd->p_term_inverse);
-
-       derivative = actual - pd->last_actual;
-       derivative = div_s64(derivative, seconds_since_update);
-       derivative = ewma_add(pd->smoothed_derivative, derivative,
-                             (pd->d_term / seconds_since_update) ?: 1);
-       derivative = derivative * pd->d_term;
-       derivative = div_s64(derivative, pd->p_term_inverse);
-
-       change = proportional + derivative;
-
-       /* Don't increase rate if not keeping up */
-       if (change > 0 &&
-           pd->backpressure &&
-           time_after64(local_clock(),
-                        pd->rate.next + NSEC_PER_MSEC))
-               change = 0;
-
-       change *= (sign * -1);
-
-       pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
-                               1, UINT_MAX);
-
-       pd->last_actual         = actual;
-       pd->last_derivative     = derivative;
-       pd->last_proportional   = proportional;
-       pd->last_change         = change;
-       pd->last_target         = target;
-}
-
-void bch2_pd_controller_init(struct bch_pd_controller *pd)
-{
-       pd->rate.rate           = 1024;
-       pd->last_update         = jiffies;
-       pd->p_term_inverse      = 6000;
-       pd->d_term              = 30;
-       pd->d_smooth            = pd->d_term;
-       pd->backpressure        = 1;
-}
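
A usage sketch tying the controller to the ratelimit above, assuming bch2_pd_controller_init() already ran; sign = -1 here presumes a consumer where raising the rate drives 'actual' down, per the comment on bch2_pd_controller_update():

static void demo_pd_tick(struct bch_pd_controller *pd, s64 target, s64 actual)
{
	bch2_pd_controller_update(pd, target, actual, -1);

	/* pd->rate now feeds bch2_ratelimit_delay()/bch2_ratelimit_increment() */
}
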
-
-void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
-{
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 20);
-
-       prt_printf(out, "rate:\t");
-       prt_human_readable_s64(out, pd->rate.rate);
-       prt_newline(out);
-
-       prt_printf(out, "target:\t");
-       prt_human_readable_u64(out, pd->last_target);
-       prt_newline(out);
-
-       prt_printf(out, "actual:\t");
-       prt_human_readable_u64(out, pd->last_actual);
-       prt_newline(out);
-
-       prt_printf(out, "proportional:\t");
-       prt_human_readable_s64(out, pd->last_proportional);
-       prt_newline(out);
-
-       prt_printf(out, "derivative:\t");
-       prt_human_readable_s64(out, pd->last_derivative);
-       prt_newline(out);
-
-       prt_printf(out, "change:\t");
-       prt_human_readable_s64(out, pd->last_change);
-       prt_newline(out);
-
-       prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
-}
-
-/* misc: */
-
-void bch2_bio_map(struct bio *bio, void *base, size_t size)
-{
-       while (size) {
-               struct page *page = is_vmalloc_addr(base)
-                               ? vmalloc_to_page(base)
-                               : virt_to_page(base);
-               unsigned offset = offset_in_page(base);
-               unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
-
-               BUG_ON(!bio_add_page(bio, page, len, offset));
-               size -= len;
-               base += len;
-       }
-}
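
A usage sketch, assuming the modern bio_alloc() signature and the buf_pages() helper declared in util.h; this maps a (possibly vmalloc'd) kernel buffer into a freshly allocated bio:

static struct bio *demo_map_buf(struct block_device *bdev, void *buf, size_t len)
{
	struct bio *bio = bio_alloc(bdev, buf_pages(buf, len),
				    REQ_OP_READ, GFP_KERNEL);

	bch2_bio_map(bio, buf, len);	/* one bvec per page the buffer touches */
	return bio;
}
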
-
-int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
-{
-       while (size) {
-               struct page *page = alloc_pages(gfp_mask, 0);
-               unsigned len = min_t(size_t, PAGE_SIZE, size);
-
-               if (!page)
-                       return -ENOMEM;
-
-               if (unlikely(!bio_add_page(bio, page, len, 0))) {
-                       __free_page(page);
-                       break;
-               }
-
-               size -= len;
-       }
-
-       return 0;
-}
-
-u64 bch2_get_random_u64_below(u64 ceil)
-{
-       if (ceil <= U32_MAX)
-               return __get_random_u32_below(ceil);
-
-       /* this is the same (clever) algorithm as in __get_random_u32_below() */
-       u64 rand = get_random_u64();
-       u64 mult = ceil * rand;
-
-       if (unlikely(mult < ceil)) {
-               u64 bound;
-               div64_u64_rem(-ceil, ceil, &bound);
-               while (unlikely(mult < bound)) {
-                       rand = get_random_u64();
-                       mult = ceil * rand;
-               }
-       }
-
-       return mul_u64_u64_shr(ceil, rand, 64);
-}
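
One added reasoning step for the rejection loop above (a sketch of Lemire's bounded-random argument):

/*
 * The 128-bit product ceil * rand is uniform over multiples of ceil; its
 * high 64 bits (the return value) lie in [0, ceil). Each possible result
 * corresponds to either floor(2^64 / ceil) or floor(2^64 / ceil) + 1
 * values of the low 64 bits ('mult'). Rejecting samples with
 * mult < (2^64 mod ceil), computed above as (-ceil) % ceil in u64
 * arithmetic, leaves exactly floor(2^64 / ceil) per result, i.e. uniform.
 */
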
-
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
-{
-       struct bio_vec bv;
-       struct bvec_iter iter;
-
-       __bio_for_each_segment(bv, dst, iter, dst_iter) {
-               void *dstp = kmap_local_page(bv.bv_page);
-
-               memcpy(dstp + bv.bv_offset, src, bv.bv_len);
-               kunmap_local(dstp);
-
-               src += bv.bv_len;
-       }
-}
-
-void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
-{
-       struct bio_vec bv;
-       struct bvec_iter iter;
-
-       __bio_for_each_segment(bv, src, iter, src_iter) {
-               void *srcp = kmap_local_page(bv.bv_page);
-
-               memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
-               kunmap_local(srcp);
-
-               dst += bv.bv_len;
-       }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *bio)
-{
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
-       bio_for_each_segment(bv, bio, iter) {
-               unsigned u64s = bv.bv_len / sizeof(u64);
-
-               if (offset < u64s) {
-                       u64 *segment = bvec_kmap_local(&bv);
-                       segment[offset] = get_random_u64();
-                       kunmap_local(segment);
-                       return;
-               }
-               offset -= u64s;
-       }
-}
-#endif
-
-void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
-{
-       prt_printf(out, "bi_remaining:\t%u\n",
-                  atomic_read(&bio->__bi_remaining));
-       prt_printf(out, "bi_end_io:\t%ps\n",
-                  bio->bi_end_io);
-       prt_printf(out, "bi_status:\t%u\n",
-                  bio->bi_status);
-}
-
-#if 0
-void eytzinger1_test(void)
-{
-       unsigned inorder, size;
-
-       pr_info("1 based eytzinger test:\n");
-
-       for (size = 2;
-            size < 65536;
-            size++) {
-               unsigned extra = eytzinger1_extra(size);
-
-               if (!(size % 4096))
-                       pr_info("tree size %u\n", size);
-
-               inorder = 1;
-               eytzinger1_for_each(eytz, size) {
-                       BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
-                       BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
-                       BUG_ON(eytz != eytzinger1_last(size) &&
-                              eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
-
-                       inorder++;
-               }
-               BUG_ON(inorder - 1 != size);
-       }
-}
-
-void eytzinger0_test(void)
-{
-       unsigned inorder, size;
-
-       pr_info("0 based eytzinger test:\n");
-
-       for (size = 1;
-            size < 65536;
-            size++) {
-               unsigned extra = eytzinger0_extra(size);
-
-               if (!(size % 4096))
-                       pr_info("tree size %u\n", size);
-
-               inorder = 0;
-               eytzinger0_for_each(eytz, size) {
-                       BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
-                       BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
-                       BUG_ON(eytz != eytzinger0_last(size) &&
-                              eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
-
-                       inorder++;
-               }
-               BUG_ON(inorder != size);
-
-               inorder = size - 1;
-               eytzinger0_for_each_prev(eytz, size) {
-                       BUG_ON(eytz != eytzinger0_first(size) &&
-                              eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
-
-                       inorder--;
-               }
-               BUG_ON(inorder != -1);
-       }
-}
-
-static inline int cmp_u16(const void *_l, const void *_r)
-{
-       const u16 *l = _l, *r = _r;
-
-       return (*l > *r) - (*r > *l);
-}
-
-static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
-{
-       int r, s;
-       bool bad;
-
-       r = eytzinger0_find_le(test_array, nr,
-                              sizeof(test_array[0]),
-                              cmp_u16, &search);
-       if (r >= 0) {
-               if (test_array[r] > search) {
-                       bad = true;
-               } else {
-                       s = eytzinger0_next(r, nr);
-                       bad = s >= 0 && test_array[s] <= search;
-               }
-       } else {
-               s = eytzinger0_last(nr);
-               bad = s >= 0 && test_array[s] <= search;
-       }
-
-       if (bad) {
-               s = -1;
-               eytzinger0_for_each_prev(j, nr) {
-                       if (test_array[j] <= search) {
-                               s = j;
-                               break;
-                       }
-               }
-
-               eytzinger0_for_each(j, nr)
-                       pr_info("[%3u] = %12u\n", j, test_array[j]);
-               pr_info("find_le(%12u) = %3i should be %3i\n",
-                       search, r, s);
-               BUG();
-       }
-}
-
-static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
-{
-       int r, s;
-       bool bad;
-
-       r = eytzinger0_find_gt(test_array, nr,
-                              sizeof(test_array[0]),
-                              cmp_u16, &search);
-       if (r >= 0) {
-               if (test_array[r] <= search) {
-                       bad = true;
-               } else {
-                       s = eytzinger0_prev(r, nr);
-                       bad = s >= 0 && test_array[s] > search;
-               }
-       } else {
-               s = eytzinger0_first(nr);
-               bad = s >= 0 && test_array[s] > search;
-       }
-
-       if (bad) {
-               s = -1;
-               eytzinger0_for_each(j, nr) {
-                       if (test_array[j] > search) {
-                               s = j;
-                               break;
-                       }
-               }
-
-               eytzinger0_for_each(j, nr)
-                       pr_info("[%3u] = %12u\n", j, test_array[j]);
-               pr_info("find_gt(%12u) = %3i should be %3i\n",
-                       search, r, s);
-               BUG();
-       }
-}
-
-static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
-{
-       int r, s;
-       bool bad;
-
-       r = eytzinger0_find_ge(test_array, nr,
-                              sizeof(test_array[0]),
-                              cmp_u16, &search);
-       if (r >= 0) {
-               if (test_array[r] < search) {
-                       bad = true;
-               } else {
-                       s = eytzinger0_prev(r, nr);
-                       bad = s >= 0 && test_array[s] >= search;
-               }
-       } else {
-               s = eytzinger0_first(nr);
-               bad = s >= 0 && test_array[s] >= search;
-       }
-
-       if (bad) {
-               s = -1;
-               eytzinger0_for_each(j, nr) {
-                       if (test_array[j] >= search) {
-                               s = j;
-                               break;
-                       }
-               }
-
-               eytzinger0_for_each(j, nr)
-                       pr_info("[%3u] = %12u\n", j, test_array[j]);
-               pr_info("find_ge(%12u) = %3i should be %3i\n",
-                       search, r, s);
-               BUG();
-       }
-}
-
-static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
-{
-       unsigned r;
-       int s;
-       bool bad;
-
-       r = eytzinger0_find(test_array, nr,
-                           sizeof(test_array[0]),
-                           cmp_u16, &search);
-
-       if (r < nr) {
-               bad = test_array[r] != search;
-       } else {
-               s = eytzinger0_find_le(test_array, nr,
-                                      sizeof(test_array[0]),
-                                      cmp_u16, &search);
-               bad = s >= 0 && test_array[s] == search;
-       }
-
-       if (bad) {
-               eytzinger0_for_each(j, nr)
-                       pr_info("[%3u] = %12u\n", j, test_array[j]);
-               pr_info("find(%12u) = %3i is incorrect\n",
-                       search, r);
-               BUG();
-       }
-}
-
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
-{
-       eytzinger0_find_test_le(test_array, nr, search);
-       eytzinger0_find_test_gt(test_array, nr, search);
-       eytzinger0_find_test_ge(test_array, nr, search);
-       eytzinger0_find_test_eq(test_array, nr, search);
-}
-
-void eytzinger0_find_test(void)
-{
-       unsigned i, nr, allocated = 1 << 12;
-       u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
-
-       for (nr = 1; nr < allocated; nr++) {
-               u16 prev = 0;
-
-               pr_info("testing %u elems\n", nr);
-
-               get_random_bytes(test_array, nr * sizeof(test_array[0]));
-               eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
-
-               /* verify array is sorted correctly: */
-               eytzinger0_for_each(j, nr) {
-                       BUG_ON(test_array[j] < prev);
-                       prev = test_array[j];
-               }
-
-               for (i = 0; i < U16_MAX; i += 1 << 12)
-                       eytzinger0_find_test_val(test_array, nr, i);
-
-               for (i = 0; i < nr; i++) {
-                       eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
-                       eytzinger0_find_test_val(test_array, nr, test_array[i]);
-                       eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
-               }
-       }
-
-       kfree(test_array);
-}
-#endif
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
- * access to the percpu counters is prevented by other locking:
- */
-u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-       u64 *ret;
-       int cpu;
-
-       /* access to pcpu vars has to be blocked by other locking */
-       preempt_disable();
-       ret = this_cpu_ptr(p);
-       preempt_enable();
-
-       for_each_possible_cpu(cpu) {
-               u64 *i = per_cpu_ptr(p, cpu);
-
-               if (i != ret) {
-                       acc_u64s(ret, i, nr);
-                       memset(i, 0, nr * sizeof(u64));
-               }
-       }
-
-       return ret;
-}
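
A usage sketch with a hypothetical counter array; per the comment above, the caller must already exclude concurrent updaters. After the call the returned cpu-local copy holds the totals and every other cpu's copy is zeroed:

static void demo_acc(u64 __percpu *counters, unsigned nr)
{
	u64 *sum = bch2_acc_percpu_u64s(counters, nr);

	for (unsigned i = 0; i < nr; i++)
		pr_info("counter %u = %llu\n", i, sum[i]);
}
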
-
-void bch2_darray_str_exit(darray_const_str *d)
-{
-       darray_for_each(*d, i)
-               kfree(*i);
-       darray_exit(d);
-}
-
-int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
-{
-       darray_init(ret);
-
-       char *dev_name, *s, *orig;
-
-       dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
-       if (!dev_name)
-               return -ENOMEM;
-
-       while ((s = strsep(&dev_name, ":"))) {
-               char *p = kstrdup(s, GFP_KERNEL);
-               if (!p)
-                       goto err;
-
-               if (darray_push(ret, p)) {
-                       kfree(p);
-                       goto err;
-               }
-       }
-
-       kfree(orig);
-       return 0;
-err:
-       bch2_darray_str_exit(ret);
-       kfree(orig);
-       return -ENOMEM;
-}
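
A usage sketch of the splitter above: a colon-separated device string becomes a darray of kstrdup'd names that the caller releases with bch2_darray_str_exit():

static void demo_split(void)
{
	darray_const_str devs;

	if (!bch2_split_devs("/dev/sda:/dev/sdb", &devs)) {
		darray_for_each(devs, i)
			pr_info("device %s\n", *i);
		bch2_darray_str_exit(&devs);
	}
}
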
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
deleted file mode 100644 (file)
index 6488f09..0000000
+++ /dev/null
@@ -1,782 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_UTIL_H
-#define _BCACHEFS_UTIL_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/closure.h>
-#include <linux/errno.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/min_heap.h>
-#include <linux/sched/clock.h>
-#include <linux/llist.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/sort.h>
-#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
-#include "time_stats.h"
-
-struct closure;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define EBUG_ON(cond)          BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define CPU_BIG_ENDIAN         0
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define CPU_BIG_ENDIAN         1
-#endif
-
-/* type hackery */
-
-#define type_is_exact(_val, _type)                                     \
-       __builtin_types_compatible_p(typeof(_val), _type)
-
-#define type_is(_val, _type)                                           \
-       (__builtin_types_compatible_p(typeof(_val), _type) ||           \
-        __builtin_types_compatible_p(typeof(_val), const _type))
-
-/* Userspace doesn't align allocations as nicely as the kernel allocators: */
-static inline size_t buf_pages(void *p, size_t len)
-{
-       return DIV_ROUND_UP(len +
-                           ((unsigned long) p & (PAGE_SIZE - 1)),
-                           PAGE_SIZE);
-}
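
A quick worked example of the offset handling above:

/*
 * e.g. with PAGE_SIZE 4096, a 200-byte buffer starting at offset 4000
 * within its first page spans two pages: DIV_ROUND_UP(200 + 4000, 4096) == 2.
 */
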
-
-static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
-{
-       void *p = unlikely(n >= INT_MAX)
-               ? vmalloc_noprof(n)
-               : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
-       if (p && (flags & __GFP_ZERO))
-               memset(p, 0, n);
-       return p;
-}
-#define bch2_kvmalloc(...)                     alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
-
-#define init_heap(heap, _size, gfp)                                    \
-({                                                                     \
-       (heap)->nr = 0;                                         \
-       (heap)->size = (_size);                                         \
-       (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
-                                (gfp));                                \
-})
-
-#define free_heap(heap)                                                        \
-do {                                                                   \
-       kvfree((heap)->data);                                           \
-       (heap)->data = NULL;                                            \
-} while (0)
-
-#define ANYSINT_MAX(t)                                                 \
-       ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-
-#include "printbuf.h"
-
-#define prt_vprintf(_out, ...)         bch2_prt_vprintf(_out, __VA_ARGS__)
-#define prt_printf(_out, ...)          bch2_prt_printf(_out, __VA_ARGS__)
-#define printbuf_str(_buf)             bch2_printbuf_str(_buf)
-#define printbuf_exit(_buf)            bch2_printbuf_exit(_buf)
-
-#define printbuf_tabstops_reset(_buf)  bch2_printbuf_tabstops_reset(_buf)
-#define printbuf_tabstop_pop(_buf)     bch2_printbuf_tabstop_pop(_buf)
-#define printbuf_tabstop_push(_buf, _n)        bch2_printbuf_tabstop_push(_buf, _n)
-
-#define printbuf_indent_add(_out, _n)  bch2_printbuf_indent_add(_out, _n)
-#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n)
-#define printbuf_indent_sub(_out, _n)  bch2_printbuf_indent_sub(_out, _n)
-
-#define prt_newline(_out)              bch2_prt_newline(_out)
-#define prt_tab(_out)                  bch2_prt_tab(_out)
-#define prt_tab_rjust(_out)            bch2_prt_tab_rjust(_out)
-
-#define prt_bytes_indented(...)                bch2_prt_bytes_indented(__VA_ARGS__)
-#define prt_u64(_out, _v)              prt_printf(_out, "%llu", (u64) (_v))
-#define prt_human_readable_u64(...)    bch2_prt_human_readable_u64(__VA_ARGS__)
-#define prt_human_readable_s64(...)    bch2_prt_human_readable_s64(__VA_ARGS__)
-#define prt_units_u64(...)             bch2_prt_units_u64(__VA_ARGS__)
-#define prt_units_s64(...)             bch2_prt_units_s64(__VA_ARGS__)
-#define prt_string_option(...)         bch2_prt_string_option(__VA_ARGS__)
-#define prt_bitflags(...)              bch2_prt_bitflags(__VA_ARGS__)
-#define prt_bitflags_vector(...)       bch2_prt_bitflags_vector(__VA_ARGS__)
-
-void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_datetime(struct printbuf *, time64_t);
-
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
-       sprintf(out, "%pUb", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
-static inline void pr_uuid(struct printbuf *out, u8 *uuid)
-{
-       char uuid_str[40];
-
-       uuid_unparse_lower(uuid, uuid_str);
-       prt_printf(out, "%s", uuid_str);
-}
-
-int bch2_strtoint_h(const char *, int *);
-int bch2_strtouint_h(const char *, unsigned int *);
-int bch2_strtoll_h(const char *, long long *);
-int bch2_strtoull_h(const char *, unsigned long long *);
-int bch2_strtou64_h(const char *, u64 *);
-
-static inline int bch2_strtol_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
-       return bch2_strtoint_h(cp, (int *) res);
-#else
-       return bch2_strtoll_h(cp, (long long *) res);
-#endif
-}
-
-static inline int bch2_strtoul_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
-       return bch2_strtouint_h(cp, (unsigned int *) res);
-#else
-       return bch2_strtoull_h(cp, (unsigned long long *) res);
-#endif
-}
-
-#define strtoi_h(cp, res)                                              \
-       ( type_is(*res, int)            ? bch2_strtoint_h(cp, (void *) res)\
-       : type_is(*res, long)           ? bch2_strtol_h(cp, (void *) res)\
-       : type_is(*res, long long)      ? bch2_strtoll_h(cp, (void *) res)\
-       : type_is(*res, unsigned)       ? bch2_strtouint_h(cp, (void *) res)\
-       : type_is(*res, unsigned long)  ? bch2_strtoul_h(cp, (void *) res)\
-       : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
-       : -EINVAL)
-
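
strtoi_h() above dispatches on the pointee type of @res:
__builtin_types_compatible_p() is a GCC/Clang compile-time predicate, so the
ternary chain folds down to a single helper call for whichever type is
actually passed. A small standalone demo of the idiom (macro and names are
illustrative):

    #include <stdio.h>

    #define type_is_demo(_val, _type)                                   \
            (__builtin_types_compatible_p(typeof(_val), _type) ||       \
             __builtin_types_compatible_p(typeof(_val), const _type))

    #define type_name(v)                                                \
            ( type_is_demo(v, int)           ? "int"                    \
            : type_is_demo(v, unsigned long) ? "unsigned long"          \
            : "other")

    int main(void)
    {
            int i = 0;
            unsigned long u = 0;

            printf("%s, %s\n", type_name(i), type_name(u));
            /* prints: int, unsigned long */
            return 0;
    }
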
-#define strtoul_safe(cp, var)                                          \
-({                                                                     \
-       unsigned long _v;                                               \
-       int _r = kstrtoul(cp, 10, &_v);                                 \
-       if (!_r)                                                        \
-               var = _v;                                               \
-       _r;                                                             \
-})
-
-#define strtoul_safe_clamp(cp, var, min, max)                          \
-({                                                                     \
-       unsigned long _v;                                               \
-       int _r = kstrtoul(cp, 10, &_v);                                 \
-       if (!_r)                                                        \
-               var = clamp_t(typeof(var), _v, min, max);               \
-       _r;                                                             \
-})
-
-#define strtoul_safe_restrict(cp, var, min, max)                       \
-({                                                                     \
-       unsigned long _v;                                               \
-       int _r = kstrtoul(cp, 10, &_v);                                 \
-       if (!_r && _v >= min && _v <= max)                              \
-               var = _v;                                               \
-       else                                                            \
-               _r = -EINVAL;                                           \
-       _r;                                                             \
-})
-
-#define snprint(out, var)                                              \
-       prt_printf(out,                                                 \
-                  type_is(var, int)            ? "%i\n"                \
-                : type_is(var, unsigned)       ? "%u\n"                \
-                : type_is(var, long)           ? "%li\n"               \
-                : type_is(var, unsigned long)  ? "%lu\n"               \
-                : type_is(var, s64)            ? "%lli\n"              \
-                : type_is(var, u64)            ? "%llu\n"              \
-                : type_is(var, char *)         ? "%s\n"                \
-                : "%i\n", var)
-
-bool bch2_is_zero(const void *, size_t);
-
-u64 bch2_read_flag_list(const char *, const char * const[]);
-
-void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
-void bch2_prt_u64_base2(struct printbuf *, u64);
-
-void bch2_print_string_as_lines(const char *, const char *);
-
-typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
-void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
-
-static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
-{
-#ifdef __KERNEL__
-       prt_printf(out, "%pg", bdev);
-#else
-       prt_str(out, bdev->name);
-#endif
-}
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-#define ewma_add(ewma, val, weight)                                    \
-({                                                                     \
-       typeof(ewma) _ewma = (ewma);                                    \
-       typeof(weight) _weight = (weight);                              \
-                                                                       \
-       (((_ewma << _weight) - _ewma) + (val)) >> _weight;              \
-})
-
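
ewma_add() above is an integer exponentially weighted moving average: the new
value is ((2^weight - 1) * old + sample) >> weight, so each sample pulls the
average 1/2^weight of the way toward itself. A self-contained sketch of the
arithmetic (function and values here are illustrative):

    #include <stdio.h>

    /* Same arithmetic as ewma_add(): with weight = 3, each new sample
     * contributes 1/8 of its difference from the running average. */
    static unsigned long ewma_demo(unsigned long ewma, unsigned long val,
                                   unsigned weight)
    {
            return (((ewma << weight) - ewma) + val) >> weight;
    }

    int main(void)
    {
            unsigned long avg = 100;

            /* Feed a step change to 20; the average decays toward it. */
            for (int i = 0; i < 5; i++) {
                    avg = ewma_demo(avg, 20, 3);
                    printf("avg = %lu\n", avg);     /* 90, 81, 73, 66, 60 */
            }
            return 0;
    }
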
-struct bch_ratelimit {
-       /* Next time we want to do some work, in nanoseconds */
-       u64                     next;
-
-       /*
-        * Rate at which we want to do work, in units per nanosecond.
-        * The units here correspond to the units passed to
-        * bch2_ratelimit_increment().
-        */
-       unsigned                rate;
-};
-
-static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
-{
-       d->next = local_clock();
-}
-
-u64 bch2_ratelimit_delay(struct bch_ratelimit *);
-void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
-
-struct bch_pd_controller {
-       struct bch_ratelimit    rate;
-       unsigned long           last_update;
-
-       s64                     last_actual;
-       s64                     smoothed_derivative;
-
-       unsigned                p_term_inverse;
-       unsigned                d_smooth;
-       unsigned                d_term;
-
-       /* for exporting to sysfs (no effect on behavior) */
-       s64                     last_derivative;
-       s64                     last_proportional;
-       s64                     last_change;
-       s64                     last_target;
-
-       /*
-        * If true, the rate will not increase if bch2_ratelimit_delay()
-        * is not being called often enough.
-        */
-       bool                    backpressure;
-};
-
-void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
-void bch2_pd_controller_init(struct bch_pd_controller *);
-void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
-
-#define sysfs_pd_controller_attribute(name)                            \
-       rw_attribute(name##_rate);                                      \
-       rw_attribute(name##_rate_bytes);                                \
-       rw_attribute(name##_rate_d_term);                               \
-       rw_attribute(name##_rate_p_term_inverse);                       \
-       read_attribute(name##_rate_debug)
-
-#define sysfs_pd_controller_files(name)                                        \
-       &sysfs_##name##_rate,                                           \
-       &sysfs_##name##_rate_bytes,                                     \
-       &sysfs_##name##_rate_d_term,                                    \
-       &sysfs_##name##_rate_p_term_inverse,                            \
-       &sysfs_##name##_rate_debug
-
-#define sysfs_pd_controller_show(name, var)                            \
-do {                                                                   \
-       sysfs_hprint(name##_rate,               (var)->rate.rate);      \
-       sysfs_print(name##_rate_bytes,          (var)->rate.rate);      \
-       sysfs_print(name##_rate_d_term,         (var)->d_term);         \
-       sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
-                                                                       \
-       if (attr == &sysfs_##name##_rate_debug)                         \
-               bch2_pd_controller_debug_to_text(out, var);             \
-} while (0)
-
-#define sysfs_pd_controller_store(name, var)                           \
-do {                                                                   \
-       sysfs_strtoul_clamp(name##_rate,                                \
-                           (var)->rate.rate, 1, UINT_MAX);             \
-       sysfs_strtoul_clamp(name##_rate_bytes,                          \
-                           (var)->rate.rate, 1, UINT_MAX);             \
-       sysfs_strtoul(name##_rate_d_term,       (var)->d_term);         \
-       sysfs_strtoul_clamp(name##_rate_p_term_inverse,                 \
-                           (var)->p_term_inverse, 1, INT_MAX);         \
-} while (0)
-
-#define container_of_or_null(ptr, type, member)                                \
-({                                                                     \
-       typeof(ptr) _ptr = ptr;                                         \
-       _ptr ? container_of(_ptr, type, member) : NULL;                 \
-})
-
-static inline struct list_head *list_pop(struct list_head *head)
-{
-       if (list_empty(head))
-               return NULL;
-
-       struct list_head *ret = head->next;
-       list_del_init(ret);
-       return ret;
-}
-
-#define list_pop_entry(head, type, member)             \
-       container_of_or_null(list_pop(head), type, member)
-
-/* Does linear interpolation between powers of two */
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
-{
-       unsigned fract = x & ~(~0 << fract_bits);
-
-       x >>= fract_bits;
-       x   = 1 << x;
-       x  += (x * fract) >> fract_bits;
-
-       return x;
-}
-
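
fract_exp_two() treats x as a fixed-point exponent: the high bits select a
power of two and the low fract_bits bits interpolate linearly toward the next
one, approximating 2^(x / 2^fract_bits) without floating point. A worked
sketch (the demo function mirrors the one above; values are illustrative):

    #include <assert.h>

    static unsigned fract_exp_two_demo(unsigned x, unsigned fract_bits)
    {
            unsigned fract = x & ~(~0U << fract_bits);

            x >>= fract_bits;
            x   = 1U << x;
            x  += (x * fract) >> fract_bits;
            return x;
    }

    int main(void)
    {
            /* fract_bits = 2: x = 10 means 2^(2 + 2/4). The integer part
             * gives 1 << 2 = 4, plus (4 * 2) >> 2 = 2, so 6 - a linear
             * approximation of 2^2.5 ~= 5.66. */
            assert(fract_exp_two_demo(10, 2) == 6);
            assert(fract_exp_two_demo(8, 2) == 4);  /* exactly 2^2 */
            return 0;
    }
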
-void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-
-#define closure_bio_submit(bio, cl)                                    \
-do {                                                                   \
-       closure_get(cl);                                                \
-       submit_bio(bio);                                                \
-} while (0)
-
-#define kthread_wait(cond)                                             \
-({                                                                     \
-       int _ret = 0;                                                   \
-                                                                       \
-       while (1) {                                                     \
-               set_current_state(TASK_INTERRUPTIBLE);                  \
-               if (kthread_should_stop()) {                            \
-                       _ret = -1;                                      \
-                       break;                                          \
-               }                                                       \
-                                                                       \
-               if (cond)                                               \
-                       break;                                          \
-                                                                       \
-               schedule();                                             \
-       }                                                               \
-       set_current_state(TASK_RUNNING);                                \
-       _ret;                                                           \
-})
-
-#define kthread_wait_freezable(cond)                                   \
-({                                                                     \
-       int _ret = 0;                                                   \
-       while (1) {                                                     \
-               set_current_state(TASK_INTERRUPTIBLE);                  \
-               if (kthread_should_stop()) {                            \
-                       _ret = -1;                                      \
-                       break;                                          \
-               }                                                       \
-                                                                       \
-               if (cond)                                               \
-                       break;                                          \
-                                                                       \
-               schedule();                                             \
-               try_to_freeze();                                        \
-       }                                                               \
-       set_current_state(TASK_RUNNING);                                \
-       _ret;                                                           \
-})
-
-u64 bch2_get_random_u64_below(u64);
-
-void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
-void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *);
-
-static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
-{
-       if (ratio && !get_random_u32_below(ratio))
-               bch2_corrupt_bio(bio);
-}
-#else
-#define bch2_maybe_corrupt_bio(...)    do {} while (0)
-#endif
-
-void bch2_bio_to_text(struct printbuf *, struct bio *);
-
-static inline void memcpy_u64s_small(void *dst, const void *src,
-                                    unsigned u64s)
-{
-       u64 *d = dst;
-       const u64 *s = src;
-
-       while (u64s--)
-               *d++ = *s++;
-}
-
-static inline void __memcpy_u64s(void *dst, const void *src,
-                                unsigned u64s)
-{
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
-       long d0, d1, d2;
-
-       asm volatile("rep ; movsq"
-                    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-                    : "0" (u64s), "1" (dst), "2" (src)
-                    : "memory");
-#else
-       u64 *d = dst;
-       const u64 *s = src;
-
-       while (u64s--)
-               *d++ = *s++;
-#endif
-}
-
-static inline void memcpy_u64s(void *dst, const void *src,
-                              unsigned u64s)
-{
-       EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
-                dst + u64s * sizeof(u64) <= src));
-
-       __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down(void *dst, const void *src,
-                                      unsigned u64s)
-{
-       __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down(void *dst, const void *src,
-                                    unsigned u64s)
-{
-       EBUG_ON(dst > src);
-
-       __memmove_u64s_down(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down_small(void *dst, const void *src,
-                                      unsigned u64s)
-{
-       memcpy_u64s_small(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down_small(void *dst, const void *src,
-                                    unsigned u64s)
-{
-       EBUG_ON(dst > src);
-
-       __memmove_u64s_down_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
-                                          unsigned u64s)
-{
-       u64 *dst = (u64 *) _dst + u64s;
-       u64 *src = (u64 *) _src + u64s;
-
-       while (u64s--)
-               *--dst = *--src;
-}
-
-static inline void memmove_u64s_up_small(void *dst, const void *src,
-                                        unsigned u64s)
-{
-       EBUG_ON(dst < src);
-
-       __memmove_u64s_up_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up(void *_dst, const void *_src,
-                                    unsigned u64s)
-{
-       u64 *dst = (u64 *) _dst + u64s - 1;
-       u64 *src = (u64 *) _src + u64s - 1;
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
-       long d0, d1, d2;
-
-       asm volatile("std ;\n"
-                    "rep ; movsq\n"
-                    "cld ;\n"
-                    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-                    : "0" (u64s), "1" (dst), "2" (src)
-                    : "memory");
-#else
-       while (u64s--)
-               *dst-- = *src--;
-#endif
-}
-
-static inline void memmove_u64s_up(void *dst, const void *src,
-                                  unsigned u64s)
-{
-       EBUG_ON(dst < src);
-
-       __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline void memmove_u64s(void *dst, const void *src,
-                               unsigned u64s)
-{
-       if (dst < src)
-               __memmove_u64s_down(dst, src, u64s);
-       else
-               __memmove_u64s_up(dst, src, u64s);
-}
-
-/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
-static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
-{
-       unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
-
-       memset(s + bytes, c, rem);
-}
-
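
memset_u64s_tail() zeroes the padding between the end of a value and the next
u64 boundary, so trailing bytes stay deterministic when keys are compared or
copied whole u64s at a time. A worked sketch, assuming a 13-byte value inside
a 16-byte (two-u64) slot:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main(void)
    {
            uint8_t buf[16];
            unsigned bytes = 13;
            /* round_up(13, 8) - 13 = 3 trailing pad bytes to clear */
            unsigned rem = ((bytes + 7) & ~7u) - bytes;

            memset(buf, 0xaa, sizeof(buf));         /* garbage payload */
            memset(buf + bytes, 0, rem);            /* deterministic tail */
            printf("cleared %u bytes at offset %u\n", rem, bytes);
            return 0;
    }
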
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos)                         \
-       memmove(&(_array)[(_pos) + 1],                                  \
-               &(_array)[(_pos)],                                      \
-               sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item)                        \
-do {                                                                   \
-       __array_insert_item(_array, _nr, _pos);                         \
-       (_nr)++;                                                        \
-       (_array)[(_pos)] = (_new_item);                                 \
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove)           \
-do {                                                                   \
-       (_nr) -= (_nr_to_remove);                                       \
-       memmove(&(_array)[(_pos)],                                      \
-               &(_array)[(_pos) + (_nr_to_remove)],                    \
-               sizeof((_array)[0]) * ((_nr) - (_pos)));                \
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos)                           \
-       array_remove_items(_array, _nr, _pos, 1)
-
-static inline void __move_gap(void *array, size_t element_size,
-                             size_t nr, size_t size,
-                             size_t old_gap, size_t new_gap)
-{
-       size_t gap_end = old_gap + size - nr;
-
-       if (new_gap < old_gap) {
-               size_t move = old_gap - new_gap;
-
-               memmove(array + element_size * (gap_end - move),
-                       array + element_size * (old_gap - move),
-                               element_size * move);
-       } else if (new_gap > old_gap) {
-               size_t move = new_gap - old_gap;
-
-               memmove(array + element_size * old_gap,
-                       array + element_size * gap_end,
-                               element_size * move);
-       }
-}
-
-/* Move the gap in a gap buffer: */
-#define move_gap(_d, _new_gap)                                         \
-do {                                                                   \
-       BUG_ON(_new_gap > (_d)->nr);                                    \
-       BUG_ON((_d)->gap > (_d)->nr);                                   \
-                                                                       \
-       __move_gap((_d)->data, sizeof((_d)->data[0]),                   \
-                  (_d)->nr, (_d)->size, (_d)->gap, _new_gap);          \
-       (_d)->gap = _new_gap;                                           \
-} while (0)
-
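
__move_gap() is the classic gap-buffer move: live elements sit at both ends of
the array with an unused gap between them, and relocating the gap memmoves
only the elements between the old and new gap positions. A toy sketch with
chars (layout and values are illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* nr = 4 live elements (a, b, c, d), size = 8, gap at index 2 */
            char buf[8] = { 'a', 'b', '_', '_', '_', '_', 'c', 'd' };
            size_t nr = 4, size = 8, old_gap = 2, new_gap = 3;
            size_t gap_end = old_gap + size - nr;   /* 6 */
            size_t move = new_gap - old_gap;        /* 1 */

            /* the new_gap > old_gap branch of __move_gap(): */
            memmove(buf + old_gap, buf + gap_end, move);
            buf[gap_end] = '_';                     /* slot 6 joins the gap */

            printf("%.8s\n", buf);                  /* "abc____d" */
            return 0;
    }
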
-#define bubble_sort(_base, _nr, _cmp)                                  \
-do {                                                                   \
-       ssize_t _i, _last;                                              \
-       bool _swapped = true;                                           \
-                                                                       \
-       for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
-               _swapped = false;                                       \
-               for (_i = 0; _i < _last; _i++)                          \
-                       if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {   \
-                               swap((_base)[_i], (_base)[_i + 1]);     \
-                               _swapped = true;                        \
-                       }                                               \
-       }                                                               \
-} while (0)
-
-#define per_cpu_sum(_p)                                                        \
-({                                                                     \
-       TYPEOF_UNQUAL(*_p) _ret = 0;                                    \
-                                                                       \
-       int cpu;                                                        \
-       for_each_possible_cpu(cpu)                                      \
-               _ret += *per_cpu_ptr(_p, cpu);                          \
-       _ret;                                                           \
-})
-
-static inline u64 percpu_u64_get(u64 __percpu *src)
-{
-       return per_cpu_sum(src);
-}
-
-static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               *per_cpu_ptr(dst, cpu) = 0;
-       this_cpu_write(*dst, src);
-}
-
-static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
-{
-       for (unsigned i = 0; i < nr; i++)
-               acc[i] += src[i];
-}
-
-static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
-                                  unsigned nr)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
-}
-
-static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               memset(per_cpu_ptr(p, cpu), c, bytes);
-}
-
-u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-
-static inline int u8_cmp(u8 l, u8 r)
-{
-       return cmp_int(l, r);
-}
-
-static inline int cmp_le32(__le32 l, __le32 r)
-{
-       return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-#include <linux/uuid.h>
-
-static inline bool qstr_eq(const struct qstr l, const struct qstr r)
-{
-       return l.len == r.len && !memcmp(l.name, r.name, l.len);
-}
-
-void bch2_darray_str_exit(darray_const_str *);
-int bch2_split_devs(const char *, darray_const_str *);
-
-#ifdef __KERNEL__
-
-__must_check
-static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
-       return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
-__must_check
-static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
-{
-       return copy_from_user(to, from, n) ? -EFAULT : 0;
-}
-
-#endif
-
-static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
-{
-       if (v)
-               set_bit(nr, addr);
-       else
-               clear_bit(nr, addr);
-}
-
-static inline void __set_bit_le64(size_t bit, __le64 *addr)
-{
-       addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline void __clear_bit_le64(size_t bit, __le64 *addr)
-{
-       addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline bool test_bit_le64(size_t bit, __le64 *addr)
-{
-       return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
-}
-
-static inline void memcpy_swab(void *_dst, void *_src, size_t len)
-{
-       u8 *dst = _dst + len;
-       u8 *src = _src;
-
-       while (len--)
-               *--dst = *src++;
-}
-
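
memcpy_swab() copies a buffer while reversing its byte order, converting
between little- and big-endian representations of a multi-byte value. A
standalone sketch of the same loop:

    #include <stdio.h>
    #include <stdint.h>

    static void memcpy_swab_demo(void *_dst, const void *_src, size_t len)
    {
            uint8_t *dst = (uint8_t *) _dst + len;
            const uint8_t *src = _src;

            while (len--)
                    *--dst = *src++;
    }

    int main(void)
    {
            uint8_t in[4] = { 0x44, 0x33, 0x22, 0x11 }; /* 0x11223344 LE */
            uint8_t out[4];

            memcpy_swab_demo(out, in, sizeof(in));
            printf("%02x %02x %02x %02x\n",
                   out[0], out[1], out[2], out[3]);     /* 11 22 33 44 */
            return 0;
    }
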
-#define set_flags(_map, _in, _out)                                     \
-do {                                                                   \
-       unsigned _i;                                                    \
-                                                                       \
-       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
-               if ((_in) & (1 << _i))                                  \
-                       (_out) |= _map[_i];                             \
-               else                                                    \
-                       (_out) &= ~_map[_i];                            \
-} while (0)
-
-#define map_flags(_map, _in)                                           \
-({                                                                     \
-       unsigned _out = 0;                                              \
-                                                                       \
-       set_flags(_map, _in, _out);                                     \
-       _out;                                                           \
-})
-
-#define map_flags_rev(_map, _in)                                       \
-({                                                                     \
-       unsigned _i, _out = 0;                                          \
-                                                                       \
-       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
-               if ((_in) & _map[_i]) {                                 \
-                       (_out) |= 1 << _i;                              \
-                       (_in) &= ~_map[_i];                             \
-               }                                                       \
-       (_out);                                                         \
-})
-
-#define map_defined(_map)                                              \
-({                                                                     \
-       unsigned _in = ~0;                                              \
-                                                                       \
-       map_flags_rev(_map, _in);                                       \
-})
-
-#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
deleted file mode 100644 (file)
index 6620eca..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bitops.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-#ifdef CONFIG_VALGRIND
-#include <valgrind/memcheck.h>
-#endif
-
-#include "errcode.h"
-#include "varint.h"
-
-/**
- * bch2_varint_encode - encode a variable length integer
- * @out:       destination to encode to
- * @v:         unsigned integer to encode
- * Returns:    size in bytes of the encoded integer - at most 9 bytes
- */
-int bch2_varint_encode(u8 *out, u64 v)
-{
-       unsigned bits = fls64(v|1);
-       unsigned bytes = DIV_ROUND_UP(bits, 7);
-       __le64 v_le;
-
-       if (likely(bytes < 9)) {
-               v <<= bytes;
-               v |= ~(~0 << (bytes - 1));
-               v_le = cpu_to_le64(v);
-               memcpy(out, &v_le, bytes);
-       } else {
-               *out++ = 255;
-               bytes = 9;
-               put_unaligned_le64(v, out);
-       }
-
-       return bytes;
-}
-
-/**
- * bch2_varint_decode - decode a variable length integer
- * @in:                varint to decode
- * @end:       end of buffer to decode from
- * @out:       on success, decoded integer
- * Returns:    size in bytes of the decoded integer - or a negative error code
- * on failure (would have read past the end of the buffer)
- */
-int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
-{
-       unsigned bytes = likely(in < end)
-               ? ffz(*in & 255) + 1
-               : 1;
-       u64 v;
-
-       if (unlikely(in + bytes > end))
-               return -BCH_ERR_varint_decode_error;
-
-       if (likely(bytes < 9)) {
-               __le64 v_le = 0;
-
-               memcpy(&v_le, in, bytes);
-               v = le64_to_cpu(v_le);
-               v >>= bytes;
-       } else {
-               v = get_unaligned_le64(++in);
-       }
-
-       *out = v;
-       return bytes;
-}
-
-/**
- * bch2_varint_encode_fast - fast version of bch2_varint_encode
- * @out:       destination to encode to
- * @v:         unsigned integer to encode
- * Returns:    size in bytes of the encoded integer - at most 9 bytes
- *
- * This version assumes it's always safe to write 8 bytes to @out, even if the
- * encoded integer would be smaller.
- */
-int bch2_varint_encode_fast(u8 *out, u64 v)
-{
-       unsigned bits = fls64(v|1);
-       unsigned bytes = DIV_ROUND_UP(bits, 7);
-
-       if (likely(bytes < 9)) {
-               v <<= bytes;
-               v |= ~(~0U << (bytes - 1));
-       } else {
-               *out++ = 255;
-               bytes = 9;
-       }
-
-       put_unaligned_le64(v, out);
-       return bytes;
-}
-
-/**
- * bch2_varint_decode_fast - fast version of bch2_varint_decode
- * @in:                varint to decode
- * @end:       end of buffer to decode from
- * @out:       on success, decoded integer
- * Returns:    size in bytes of the decoded integer - or a negative error code
- * on failure (would have read past the end of the buffer)
- *
- * This version assumes that it is safe to read at most 8 bytes past the end of
- * @end (we still return an error if the varint extends past @end).
- */
-int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
-{
-#ifdef CONFIG_VALGRIND
-       VALGRIND_MAKE_MEM_DEFINED(in, 8);
-#endif
-       u64 v = get_unaligned_le64(in);
-       unsigned bytes = ffz(*in) + 1;
-
-       if (unlikely(in + bytes > end))
-               return -BCH_ERR_varint_decode_error;
-
-       if (likely(bytes < 9)) {
-               v >>= bytes;
-               v &= ~(~0ULL << (7 * bytes));
-       } else {
-               v = get_unaligned_le64(++in);
-       }
-
-       *out = v;
-       return bytes;
-}
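
The varint scheme in this file is length-prefixed rather than
continuation-bit based: a value needing b bytes (b < 9) is shifted left by b
and its low b - 1 bits are set to ones, so ffz() on the first byte recovers
the length in one step; values wider than 56 bits use a 0xff escape byte
followed by a full little-endian u64. A userspace sketch of the short-form
round trip, assuming a little-endian host and GCC/Clang builtins (the 0xff
escape is omitted):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Encode: shift left by the byte count, set bytes - 1 low one-bits. */
    static int varint_encode_demo(uint8_t *out, uint64_t v)
    {
            unsigned bits = 64 - __builtin_clzll(v | 1);
            unsigned bytes = (bits + 6) / 7;

            if (bytes >= 9)
                    return -1;          /* 0xff escape omitted here */

            v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
            memcpy(out, &v, bytes);     /* little-endian host assumed */
            return bytes;
    }

    /* Decode: count trailing one-bits in the first byte (ffz + 1). */
    static int varint_decode_demo(const uint8_t *in, uint64_t *out)
    {
            unsigned bytes = __builtin_ctz(~(unsigned) *in) + 1;
            uint64_t v = 0;

            if (bytes >= 9)
                    return -1;          /* 0xff escape omitted here */

            memcpy(&v, in, bytes);      /* little-endian host assumed */
            *out = v >> bytes;
            return bytes;
    }

    int main(void)
    {
            uint8_t buf[8];
            uint64_t v = 0;
            int len = varint_encode_demo(buf, 300);

            varint_decode_demo(buf, &v);
            printf("len=%d v=%llu\n", len, (unsigned long long) v);
            /* prints: len=2 v=300 */
            return 0;
    }
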
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
deleted file mode 100644 (file)
index 92a182f..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_VARINT_H
-#define _BCACHEFS_VARINT_H
-
-int bch2_varint_encode(u8 *, u64);
-int bch2_varint_decode(const u8 *, const u8 *, u64 *);
-
-int bch2_varint_encode_fast(u8 *, u64);
-int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
-
-#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
deleted file mode 100644 (file)
index 2ad338e..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VSTRUCTS_H
-#define _VSTRUCTS_H
-
-#include "util.h"
-
-/*
- * NOTE: type_is() can't differentiate between __le64 and u64, so this
- * assumes the u64s field is little endian:
- */
-#define __vstruct_u64s(_s)                                             \
-({                                                                     \
-       ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s)           \
-       : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s)           \
-       : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s)           \
-       : ((__force u8) ((_s)->u64s)));                                         \
-})
-
-#define __vstruct_bytes(_type, _u64s)                                  \
-({                                                                     \
-       BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));             \
-                                                                       \
-       (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64));      \
-})
-
-#define vstruct_bytes(_s)                                              \
-       __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
-
-#define __vstruct_blocks(_type, _sector_block_bits, _u64s)             \
-       (round_up(__vstruct_bytes(_type, _u64s),                        \
-                 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
-
-#define vstruct_blocks(_s, _sector_block_bits)                         \
-       __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
-
-#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s)             \
-       __vstruct_blocks(typeof(*(_s)), _sector_block_bits,             \
-                        __vstruct_u64s(_s) + (_u64s))
-
-#define vstruct_sectors(_s, _sector_block_bits)                                \
-       (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
-
-#define vstruct_next(_s)                                               \
-       ((typeof(_s))                   ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_last(_s)                                               \
-       ((typeof(&(_s)->start[0]))      ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_end(_s)                                                        \
-       ((void *)                       ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-
-#define vstruct_for_each(_s, _i)                                       \
-       for (typeof(&(_s)->start[0]) _i = (_s)->start;                  \
-            _i < vstruct_last(_s);                                     \
-            _i = vstruct_next(_i))
-
-#define vstruct_for_each_safe(_s, _i)                                  \
-       for (typeof(&(_s)->start[0]) _next, _i = (_s)->start;           \
-            _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
-            _i = _next)
-
-#define vstruct_idx(_s, _idx)                                          \
-       ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
-
-#endif /* _VSTRUCTS_H */
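
The vstruct macros work on any structure that stores its own length in a u64s
field (counted in 8-byte units) ahead of a flexible _data payload, whatever
the width of that field. A hedged sketch of the expected shape (demo_vstruct
is invented for illustration; the real on-disk fields are little-endian
types):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    /* The shape vstructs.h expects: a header carrying its own payload
     * length in u64s, followed by the payload itself at _data. */
    struct demo_vstruct {
            uint32_t        magic;
            uint16_t        u64s;       /* payload length, in 8-byte units */
            uint16_t        flags;
            uint64_t        _data[];
    };

    /* Equivalent of __vstruct_bytes(): header plus u64s * 8 payload bytes */
    static size_t demo_vstruct_bytes(const struct demo_vstruct *s)
    {
            return offsetof(struct demo_vstruct, _data) +
                   s->u64s * sizeof(uint64_t);
    }

    int main(void)
    {
            struct demo_vstruct s = { .u64s = 3 };

            printf("%zu\n", demo_vstruct_bytes(&s)); /* 8 + 24 = 32 */
            return 0;
    }
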
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
deleted file mode 100644 (file)
index 627f153..0000000
+++ /dev/null
@@ -1,642 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "fs.h"
-#include "rebalance.h"
-#include "str_hash.h"
-#include "xattr.h"
-
-#include <linux/dcache.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
-static u64 bch2_xattr_hash(const struct bch_hash_info *info,
-                         const struct xattr_search_key *key)
-{
-       struct bch_str_hash_ctx ctx;
-
-       bch2_str_hash_init(&ctx, info);
-       bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
-       bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
-
-       return bch2_str_hash_end(&ctx, info);
-}
-
-static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
-{
-       return bch2_xattr_hash(info, key);
-}
-
-static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
-       struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
-
-       return bch2_xattr_hash(info,
-                &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
-}
-
-static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
-{
-       struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
-       const struct xattr_search_key *r = _r;
-
-       return l.v->x_type != r->type ||
-               l.v->x_name_len != r->name.len ||
-               memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
-}
-
-static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
-       struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
-       struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
-
-       return l.v->x_type != r.v->x_type ||
-               l.v->x_name_len != r.v->x_name_len ||
-               memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
-}
-
-const struct bch_hash_desc bch2_xattr_hash_desc = {
-       .btree_id       = BTREE_ID_xattrs,
-       .key_type       = KEY_TYPE_xattr,
-       .hash_key       = xattr_hash_key,
-       .hash_bkey      = xattr_hash_bkey,
-       .cmp_key        = xattr_cmp_key,
-       .cmp_bkey       = xattr_cmp_bkey,
-};
-
-int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
-                       struct bkey_validate_context from)
-{
-       struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-       unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
-                                          le16_to_cpu(xattr.v->x_val_len));
-       int ret = 0;
-
-       bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
-                        c, xattr_val_size_too_small,
-                        "value too small (%zu < %u)",
-                        bkey_val_u64s(k.k), val_u64s);
-
-       /* XXX why +4 ? */
-       val_u64s = xattr_val_u64s(xattr.v->x_name_len,
-                                 le16_to_cpu(xattr.v->x_val_len) + 4);
-
-       bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
-                        c, xattr_val_size_too_big,
-                        "value too big (%zu > %u)",
-                        bkey_val_u64s(k.k), val_u64s);
-
-       bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
-                        c, xattr_invalid_type,
-                        "invalid type (%u)", xattr.v->x_type);
-
-       bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
-                        c, xattr_name_invalid_chars,
-                        "xattr name has invalid characters");
-fsck_err:
-       return ret;
-}
-
-void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
-                       struct bkey_s_c k)
-{
-       const struct xattr_handler *handler;
-       struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
-       handler = bch2_xattr_type_to_handler(xattr.v->x_type);
-       if (handler && handler->prefix)
-               prt_printf(out, "%s", handler->prefix);
-       else if (handler)
-               prt_printf(out, "(type %u)", xattr.v->x_type);
-       else
-               prt_printf(out, "(unknown type %u)", xattr.v->x_type);
-
-       unsigned name_len = xattr.v->x_name_len;
-       unsigned val_len  = le16_to_cpu(xattr.v->x_val_len);
-       unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
-               offsetof(struct bch_xattr, x_name_and_value);
-
-       val_len  = min_t(int, val_len, max_name_val_bytes - name_len);
-       name_len = min(name_len, max_name_val_bytes);
-
-       prt_printf(out, "%.*s:%.*s",
-                  name_len, xattr.v->x_name_and_value,
-                  val_len,  (char *) xattr_val(xattr.v));
-
-       if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
-           xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
-               prt_char(out, ' ');
-               bch2_acl_to_text(out, xattr_val(xattr.v),
-                                le16_to_cpu(xattr.v->x_val_len));
-       }
-}
-
-static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
-                               const char *name, void *buffer, size_t size, int type)
-{
-       struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
-       struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
-       struct btree_iter iter;
-       struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
-                                            inode_inum(inode), &search, 0);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-       ret = le16_to_cpu(xattr.v->x_val_len);
-       if (buffer) {
-               if (ret > size)
-                       ret = -ERANGE;
-               else
-                       memcpy(buffer, xattr_val(xattr.v), ret);
-       }
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
-                  struct bch_inode_unpacked *inode_u,
-                  const struct bch_hash_info *hash_info,
-                  const char *name, const void *value, size_t size,
-                  int type, int flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter inode_iter = {};
-       int ret;
-
-       ret   = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
-               bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
-       if (ret)
-               return ret;
-
-       /*
-        * Besides the ctime update, extent, dirent and xattr updates require
-        * that an inode update also happens - to ensure that if a key exists in
-        * one of those btrees with a given snapshot ID, an inode is also present
-        */
-       inode_u->bi_ctime = bch2_current_time(c);
-
-       ret = bch2_inode_write(trans, &inode_iter, inode_u);
-       bch2_trans_iter_exit(trans, &inode_iter);
-
-       if (ret)
-               return ret;
-
-       if (value) {
-               struct bkey_i_xattr *xattr;
-               unsigned namelen = strlen(name);
-               unsigned u64s = BKEY_U64s +
-                       xattr_val_u64s(namelen, size);
-
-               if (u64s > U8_MAX)
-                       return -ERANGE;
-
-               xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
-               if (IS_ERR(xattr))
-                       return PTR_ERR(xattr);
-
-               bkey_xattr_init(&xattr->k_i);
-               xattr->k.u64s           = u64s;
-               xattr->v.x_type         = type;
-               xattr->v.x_name_len     = namelen;
-               xattr->v.x_val_len      = cpu_to_le16(size);
-               memcpy(xattr->v.x_name_and_value, name, namelen);
-               memcpy(xattr_val(&xattr->v), value, size);
-
-               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
-                             inum, &xattr->k_i,
-                             (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
-                             (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
-       } else {
-               struct xattr_search_key search =
-                       X_SEARCH(type, name, strlen(name));
-
-               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
-                                      hash_info, inum, &search);
-       }
-
-       if (bch2_err_matches(ret, ENOENT))
-               ret = flags & XATTR_REPLACE ? -ENODATA : 0;
-
-       return ret;
-}
-
-struct xattr_buf {
-       char            *buf;
-       size_t          len;
-       size_t          used;
-};
-
-static int __bch2_xattr_emit(const char *prefix,
-                            const char *name, size_t name_len,
-                            struct xattr_buf *buf)
-{
-       const size_t prefix_len = strlen(prefix);
-       const size_t total_len = prefix_len + name_len + 1;
-
-       if (buf->buf) {
-               if (buf->used + total_len > buf->len)
-                       return -ERANGE;
-
-               memcpy(buf->buf + buf->used, prefix, prefix_len);
-               memcpy(buf->buf + buf->used + prefix_len,
-                      name, name_len);
-               buf->buf[buf->used + prefix_len + name_len] = '\0';
-       }
-
-       buf->used += total_len;
-       return 0;
-}
-
-static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
-{
-       const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
-
-       if (!xattr_handler_can_list(handler, dentry))
-               return NULL;
-
-       return xattr_prefix(handler);
-}
-
-static int bch2_xattr_emit(struct dentry *dentry,
-                           const struct bch_xattr *xattr,
-                           struct xattr_buf *buf)
-{
-       const char *prefix;
-
-       prefix = bch2_xattr_prefix(xattr->x_type, dentry);
-       if (!prefix)
-               return 0;
-
-       return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
-}
-
-static int bch2_xattr_list_bcachefs(struct bch_fs *c,
-                                   struct bch_inode_unpacked *inode,
-                                   struct xattr_buf *buf,
-                                   bool all)
-{
-       const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
-       unsigned id;
-       int ret = 0;
-       u64 v;
-
-       for (id = 0; id < Inode_opt_nr; id++) {
-               v = bch2_inode_opt_get(inode, id);
-               if (!v)
-                       continue;
-
-               if (!all &&
-                   !(inode->bi_fields_set & (1 << id)))
-                       continue;
-
-               ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
-                                       strlen(bch2_inode_opts[id]), buf);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
-ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
-       struct bch_fs *c = dentry->d_sb->s_fs_info;
-       struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
-       u64 offset = 0, inum = inode->ei_inode.bi_inum;
-
-       int ret = bch2_trans_run(c,
-               for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
-                                  POS(inum, offset),
-                                  POS(inum, U64_MAX),
-                                  inode->ei_inum.subvol, 0, k, ({
-                       if (k.k->type != KEY_TYPE_xattr)
-                               continue;
-
-                       bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
-               }))) ?:
-               bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
-               bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
-
-       return ret ? bch2_err_class(ret) : buf.used;
-}
-
-static int bch2_xattr_get_handler(const struct xattr_handler *handler,
-                                 struct dentry *dentry, struct inode *vinode,
-                                 const char *name, void *buffer, size_t size)
-{
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret = bch2_trans_do(c,
-               bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
-
-       if (ret < 0 && bch2_err_matches(ret, ENOENT))
-               ret = -ENODATA;
-
-       return bch2_err_class(ret);
-}
-
-static int bch2_xattr_set_handler(const struct xattr_handler *handler,
-                                 struct mnt_idmap *idmap,
-                                 struct dentry *dentry, struct inode *vinode,
-                                 const char *name, const void *value,
-                                 size_t size, int flags)
-{
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-       struct bch_inode_unpacked inode_u;
-       int ret;
-
-       ret = bch2_trans_run(c,
-               commit_do(trans, NULL, NULL, 0,
-                       bch2_xattr_set(trans, inode_inum(inode), &inode_u,
-                                      &hash, name, value, size,
-                                      handler->flags, flags)) ?:
-               (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
-
-       return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_user_handler = {
-       .prefix = XATTR_USER_PREFIX,
-       .get    = bch2_xattr_get_handler,
-       .set    = bch2_xattr_set_handler,
-       .flags  = KEY_TYPE_XATTR_INDEX_USER,
-};
-
-static bool bch2_xattr_trusted_list(struct dentry *dentry)
-{
-       return capable(CAP_SYS_ADMIN);
-}
-
-static const struct xattr_handler bch_xattr_trusted_handler = {
-       .prefix = XATTR_TRUSTED_PREFIX,
-       .list   = bch2_xattr_trusted_list,
-       .get    = bch2_xattr_get_handler,
-       .set    = bch2_xattr_set_handler,
-       .flags  = KEY_TYPE_XATTR_INDEX_TRUSTED,
-};
-
-static const struct xattr_handler bch_xattr_security_handler = {
-       .prefix = XATTR_SECURITY_PREFIX,
-       .get    = bch2_xattr_get_handler,
-       .set    = bch2_xattr_set_handler,
-       .flags  = KEY_TYPE_XATTR_INDEX_SECURITY,
-};
-
-#ifndef NO_BCACHEFS_FS
-
-static int opt_to_inode_opt(int id)
-{
-       switch (id) {
-#define x(name, ...)                           \
-       case Opt_##name: return Inode_opt_##name;
-       BCH_INODE_OPTS()
-#undef  x
-       default:
-               return -1;
-       }
-}
-
-static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
-                               struct dentry *dentry, struct inode *vinode,
-                               const char *name, void *buffer, size_t size,
-                               bool all)
-{
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_opts opts =
-               bch2_inode_opts_to_opts(&inode->ei_inode);
-       const struct bch_option *opt;
-       int id, inode_opt_id;
-       struct printbuf out = PRINTBUF;
-       int ret;
-       u64 v;
-
-       id = bch2_opt_lookup(name);
-       if (id < 0 || !bch2_opt_is_inode_opt(id))
-               return -EINVAL;
-
-       inode_opt_id = opt_to_inode_opt(id);
-       if (inode_opt_id < 0)
-               return -EINVAL;
-
-       opt = bch2_opt_table + id;
-
-       if (!bch2_opt_defined_by_id(&opts, id))
-               return -ENODATA;
-
-       if (!all &&
-           !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
-               return -ENODATA;
-
-       v = bch2_opt_get_by_id(&opts, id);
-       bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
-
-       ret = out.pos;
-
-       if (out.allocation_failure) {
-               ret = -ENOMEM;
-       } else if (buffer) {
-               if (out.pos > size)
-                       ret = -ERANGE;
-               else
-                       memcpy(buffer, out.buf, out.pos);
-       }
-
-       printbuf_exit(&out);
-       return ret;
-}
-
-static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
-                                  struct dentry *dentry, struct inode *vinode,
-                                  const char *name, void *buffer, size_t size)
-{
-       return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
-                                        name, buffer, size, false);
-}
-
-struct inode_opt_set {
-       int                     id;
-       u64                     v;
-       bool                    defined;
-};
-
-static int inode_opt_set_fn(struct btree_trans *trans,
-                           struct bch_inode_info *inode,
-                           struct bch_inode_unpacked *bi,
-                           void *p)
-{
-       struct inode_opt_set *s = p;
-
-       if (s->id == Inode_opt_casefold) {
-               int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
-               if (ret)
-                       return ret;
-       }
-
-       if (s->defined)
-               bi->bi_fields_set |= 1U << s->id;
-       else
-               bi->bi_fields_set &= ~(1U << s->id);
-
-       bch2_inode_opt_set(bi, s->id, s->v);
-
-       return 0;
-}
-
-static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
-                                  struct mnt_idmap *idmap,
-                                  struct dentry *dentry, struct inode *vinode,
-                                  const char *name, const void *value,
-                                  size_t size, int flags)
-{
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       const struct bch_option *opt;
-       char *buf;
-       struct inode_opt_set s;
-       int opt_id, inode_opt_id, ret;
-
-       opt_id = bch2_opt_lookup(name);
-       if (opt_id < 0)
-               return -EINVAL;
-
-       opt = bch2_opt_table + opt_id;
-
-       inode_opt_id = opt_to_inode_opt(opt_id);
-       if (inode_opt_id < 0)
-               return -EINVAL;
-
-       s.id = inode_opt_id;
-
-       if (value) {
-               u64 v = 0;
-
-               buf = kmalloc(size + 1, GFP_KERNEL);
-               if (!buf)
-                       return -ENOMEM;
-               memcpy(buf, value, size);
-               buf[size] = '\0';
-
-               ret = bch2_opt_parse(c, opt, buf, &v, NULL);
-               kfree(buf);
-
-               if (ret < 0)
-                       goto err_class_exit;
-
-               ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
-               if (ret < 0)
-                       goto err_class_exit;
-
-               s.v = v + 1;
-               s.defined = true;
-       } else {
-               /*
-                * Check if this option was set on the parent - if so, switch
-                * back to inheriting from the parent:
-                *
-                * rename() also has to deal with keeping inherited options up
-                * to date - see bch2_reinherit_attrs()
-                */
-               spin_lock(&dentry->d_lock);
-               if (!IS_ROOT(dentry)) {
-                       struct bch_inode_info *dir =
-                               to_bch_ei(d_inode(dentry->d_parent));
-
-                       s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
-               } else {
-                       s.v = 0;
-               }
-               spin_unlock(&dentry->d_lock);
-
-               s.defined = false;
-       }
-
-       mutex_lock(&inode->ei_update_lock);
-       if (inode_opt_id == Inode_opt_project) {
-               /*
-                * inode fields accessible via the xattr interface are stored
-                * with a +1 bias, so that 0 means unset:
-                */
-               ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
-err:
-       mutex_unlock(&inode->ei_update_lock);
-err_class_exit:
-       return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_handler = {
-       .prefix = "bcachefs.",
-       .get    = bch2_xattr_bcachefs_get,
-       .set    = bch2_xattr_bcachefs_set,
-};
-
-static int bch2_xattr_bcachefs_get_effective(
-                               const struct xattr_handler *handler,
-                               struct dentry *dentry, struct inode *vinode,
-                               const char *name, void *buffer, size_t size)
-{
-       return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
-                                        name, buffer, size, true);
-}
-
-/* Noop - xattrs in the bcachefs_effective namespace are inherited */
-static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
-                                  struct mnt_idmap *idmap,
-                                  struct dentry *dentry, struct inode *vinode,
-                                  const char *name, const void *value,
-                                  size_t size, int flags)
-{
-       return 0;
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
-       .prefix = "bcachefs_effective.",
-       .get    = bch2_xattr_bcachefs_get_effective,
-       .set    = bch2_xattr_bcachefs_set_effective,
-};
-
-#endif /* NO_BCACHEFS_FS */
-
-const struct xattr_handler * const bch2_xattr_handlers[] = {
-       &bch_xattr_user_handler,
-       &bch_xattr_trusted_handler,
-       &bch_xattr_security_handler,
-#ifndef NO_BCACHEFS_FS
-       &bch_xattr_bcachefs_handler,
-       &bch_xattr_bcachefs_effective_handler,
-#endif
-       NULL
-};
-
-static const struct xattr_handler *bch_xattr_handler_map[] = {
-       [KEY_TYPE_XATTR_INDEX_USER]               = &bch_xattr_user_handler,
-       [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS]   = &nop_posix_acl_access,
-       [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT]  = &nop_posix_acl_default,
-       [KEY_TYPE_XATTR_INDEX_TRUSTED]            = &bch_xattr_trusted_handler,
-       [KEY_TYPE_XATTR_INDEX_SECURITY]           = &bch_xattr_security_handler,
-};
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
-{
-       return type < ARRAY_SIZE(bch_xattr_handler_map)
-               ? bch_xattr_handler_map[type]
-               : NULL;
-}
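
The deleted xattr.c above wires per-inode options into two xattr namespaces: "bcachefs." exposes options explicitly set on the inode, while "bcachefs_effective." also reports values inherited from the parent directory (its set handler is deliberately a no-op). Stored option values carry a +1 bias so that 0 can mean "unset, inherit from the parent". A minimal standalone sketch of that bias convention, using hypothetical names that are not part of the driver:

/* Hypothetical sketch (not driver code) of the +1 bias convention. */
#include <stdint.h>

struct biased_opt {
	uint64_t stored;	/* 0 = unset, v + 1 = explicitly set to v */
};

/* Setting stores v + 1, keeping 0 free to mean "unset". */
static inline void opt_set(struct biased_opt *o, uint64_t v)
{
	o->stored = v + 1;
}

/* Clearing returns the inode to "inherit from the parent". */
static inline void opt_clear(struct biased_opt *o)
{
	o->stored = 0;
}

/*
 * The effective value: an explicitly set child wins, otherwise fall
 * back to the parent, otherwise 0 (the option's default).
 */
static inline uint64_t opt_effective(const struct biased_opt *child,
				     const struct biased_opt *parent)
{
	if (child->stored)
		return child->stored - 1;
	return parent->stored ? parent->stored - 1 : 0;
}
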
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
deleted file mode 100644 (file)
index 1139bf3..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_H
-#define _BCACHEFS_XATTR_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_xattr_hash_desc;
-
-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
-                       struct bkey_validate_context);
-void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_xattr ((struct bkey_ops) {       \
-       .key_validate   = bch2_xattr_validate,          \
-       .val_to_text    = bch2_xattr_to_text,           \
-       .min_val_size   = 8,                            \
-})
-
-static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
-{
-       return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
-                           name_len + val_len, sizeof(u64));
-}
-
-#define xattr_val(_xattr)                                      \
-       ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
-
-struct xattr_search_key {
-       u8              type;
-       struct qstr     name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)        \
-       { .type = _type, .name = QSTR_INIT(_name, _len) })
-
-struct dentry;
-struct xattr_handler;
-struct bch_hash_info;
-struct bch_inode_info;
-
-/* Exported for cmd_migrate.c in tools: */
-int bch2_xattr_set(struct btree_trans *, subvol_inum,
-                  struct bch_inode_unpacked *, const struct bch_hash_info *,
-                  const char *, const void *, size_t, int, int);
-
-ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-
-extern const struct xattr_handler * const bch2_xattr_handlers[];
-
-#endif /* _BCACHEFS_XATTR_H */
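
In the header above, xattr_val_u64s() sizes an xattr key by packing name and value back to back after the fixed fields and rounding the total up to 64-bit units, and xattr_val() locates the value by skipping x_name_len bytes of name. A worked sketch of that arithmetic, assuming the fixed fields before the flexible array occupy 4 bytes (x_type + x_name_len + x_val_len, with struct bch_val contributing no storage); the example name and value lengths are made up:

/* Worked sketch of the xattr_val_u64s() size math. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned sketch_xattr_val_u64s(unsigned name_len, unsigned val_len)
{
	/* Assumed fixed fields before x_name_and_value[]:
	 * x_type (1) + x_name_len (1) + x_val_len (2) = 4 bytes. */
	unsigned fixed = 1 + 1 + 2;

	return DIV_ROUND_UP(fixed + name_len + val_len, 8);
}

int main(void)
{
	/* "compression" (11) + "lz4" (3) + 4 fixed = 18 bytes -> 3 u64s */
	printf("%u\n", sketch_xattr_val_u64s(11, 3));
	return 0;
}
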
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
deleted file mode 100644 (file)
index 4121b78..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_FORMAT_H
-#define _BCACHEFS_XATTR_FORMAT_H
-
-#define KEY_TYPE_XATTR_INDEX_USER              0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS  1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED           3
-#define KEY_TYPE_XATTR_INDEX_SECURITY          4
-
-struct bch_xattr {
-       struct bch_val          v;
-       __u8                    x_type;
-       __u8                    x_name_len;
-       __le16                  x_val_len;
-       /*
-        * x_name_and_value contains the name and value counted by
-        * x_name_len + x_val_len. The introduction of
-        * __counted_by(x_name_len) previously caused a false positive
-        * detection of an out of bounds write.
-        */
-       __u8                    x_name_and_value[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_XATTR_FORMAT_H */
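
The KEY_TYPE_XATTR_INDEX_* constants above are the on-disk values of x_type; in the deleted xattr.c they index bch_xattr_handler_map[] to select the handler for the corresponding Linux xattr namespace. A minimal sketch of that lookup, with prefix strings standing in for the kernel's struct xattr_handler pointers:

/* Sketch of the x_type -> namespace lookup; prefix strings stand in
 * for the kernel's xattr handler structs. */
#include <stddef.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

static const char * const xattr_type_prefixes[] = {
	[0] = "user.",				/* KEY_TYPE_XATTR_INDEX_USER */
	[1] = "system.posix_acl_access",	/* ..._POSIX_ACL_ACCESS */
	[2] = "system.posix_acl_default",	/* ..._POSIX_ACL_DEFAULT */
	[3] = "trusted.",			/* ..._TRUSTED */
	[4] = "security.",			/* ..._SECURITY */
};

/* Out-of-range types (e.g. from a corrupt key) map to NULL, as in
 * bch2_xattr_type_to_handler(). */
static const char *xattr_type_to_prefix(unsigned type)
{
	return type < ARRAY_SIZE(xattr_type_prefixes)
		? xattr_type_prefixes[type]
		: NULL;
}
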